8 8 9 36 39 40 19 3 2 1 7 3 5 62 52 10 24 1 24 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/array_size.h> #include <linux/atomic.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kobject.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/dsfield.h> #include <net/genetlink.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_iv_ogm.h" #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "multicast.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" #include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, * list traversals just rcu-locked */ struct list_head batadv_hardif_list; unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct workqueue_struct *batadv_event_workqueue; static void batadv_recv_handler_init(void); #define BATADV_UEV_TYPE_VAR "BATTYPE=" #define BATADV_UEV_ACTION_VAR "BATACTION=" #define BATADV_UEV_DATA_VAR "BATDATA=" static char *batadv_uev_action_str[] = { "add", "del", "change", "loopdetect", }; static char *batadv_uev_type_str[] = { "gw", "bla", }; static int __init batadv_init(void) { int ret; ret = batadv_tt_cache_init(); if (ret < 0) return ret; INIT_LIST_HEAD(&batadv_hardif_list); batadv_algo_init(); batadv_recv_handler_init(); batadv_v_init(); batadv_iv_init(); batadv_nc_init(); batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); if (!batadv_event_workqueue) goto err_create_wq; register_netdevice_notifier(&batadv_hard_if_notifier); rtnl_link_register(&batadv_link_ops); batadv_netlink_register(); pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n", BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); return 0; err_create_wq: batadv_tt_cache_destroy(); return -ENOMEM; } static void __exit batadv_exit(void) { batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); destroy_workqueue(batadv_event_workqueue); batadv_event_workqueue = NULL; rcu_barrier(); batadv_tt_cache_destroy(); } /** * batadv_mesh_init() - Initialize soft interface * @soft_iface: netdev struct of the soft interface * * Return: 0 on success or negative error number in case of failure */ int batadv_mesh_init(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); int ret; spin_lock_init(&bat_priv->forw_bat_list_lock); spin_lock_init(&bat_priv->forw_bcast_list_lock); spin_lock_init(&bat_priv->tt.changes_list_lock); spin_lock_init(&bat_priv->tt.req_list_lock); spin_lock_init(&bat_priv->tt.roam_list_lock); spin_lock_init(&bat_priv->tt.last_changeset_lock); spin_lock_init(&bat_priv->tt.commit_lock); spin_lock_init(&bat_priv->gw.list_lock); #ifdef CONFIG_BATMAN_ADV_MCAST spin_lock_init(&bat_priv->mcast.mla_lock); spin_lock_init(&bat_priv->mcast.want_lists_lock); #endif spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); spin_lock_init(&bat_priv->softif_vlan_list_lock); spin_lock_init(&bat_priv->tp_list_lock); INIT_HLIST_HEAD(&bat_priv->forw_bat_list); INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); INIT_HLIST_HEAD(&bat_priv->gw.gateway_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list); INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list); INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list); #endif INIT_LIST_HEAD(&bat_priv->tt.changes_list); INIT_HLIST_HEAD(&bat_priv->tt.req_list); INIT_LIST_HEAD(&bat_priv->tt.roam_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.mla_list); #endif INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); bat_priv->gw.generation = 0; ret = batadv_originator_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_orig; } ret = batadv_tt_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_tt; } ret = batadv_v_mesh_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_v; } ret = batadv_bla_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_bla; } ret = batadv_dat_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_dat; } ret = batadv_nc_mesh_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_nc; } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); atomic_set(&bat_priv->gw.reselect, 0); atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE); return 0; err_nc: batadv_dat_free(bat_priv); err_dat: batadv_bla_free(bat_priv); err_bla: batadv_v_mesh_free(bat_priv); err_v: batadv_tt_free(bat_priv); err_tt: batadv_originator_free(bat_priv); err_orig: batadv_purge_outstanding_packets(bat_priv, NULL); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); return ret; } /** * batadv_mesh_free() - Deinitialize soft interface * @soft_iface: netdev struct of the soft interface */ void batadv_mesh_free(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); batadv_gw_node_free(bat_priv); batadv_v_mesh_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); batadv_mcast_free(bat_priv); /* Free the TT and the originator tables only after having terminated * all the other depending components which may use these structures for * their purposes. */ batadv_tt_free(bat_priv); /* Since the originator table clean up routine is accessing the TT * tables as well, it has to be invoked after the TT tables have been * freed and marked as empty. This ensures that no cleanup RCU callbacks * accessing the TT data are scheduled for later execution. */ batadv_originator_free(bat_priv); batadv_gw_free(bat_priv); free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); } /** * batadv_is_my_mac() - check if the given mac address belongs to any of the * real interfaces in the current mesh * @bat_priv: the bat priv with all the soft interface information * @addr: the address to check * * Return: 'true' if the mac address was found, false otherwise. */ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { const struct batadv_hard_iface *hard_iface; bool is_my_mac = false; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { is_my_mac = true; break; } } rcu_read_unlock(); return is_my_mac; } /** * batadv_max_header_len() - calculate maximum encapsulation overhead for a * payload packet * * Return: the maximum encapsulation overhead in bytes. */ int batadv_max_header_len(void) { int header_len = 0; header_len = max_t(int, header_len, sizeof(struct batadv_unicast_packet)); header_len = max_t(int, header_len, sizeof(struct batadv_unicast_4addr_packet)); header_len = max_t(int, header_len, sizeof(struct batadv_bcast_packet)); #ifdef CONFIG_BATMAN_ADV_NC header_len = max_t(int, header_len, sizeof(struct batadv_coded_packet)); #endif return header_len + ETH_HLEN; } /** * batadv_skb_set_priority() - sets skb priority according to packet content * @skb: the packet to be sent * @offset: offset to the packet content * * This function sets a value between 256 and 263 (802.1d priority), which * can be interpreted by the cfg80211 or other drivers. */ void batadv_skb_set_priority(struct sk_buff *skb, int offset) { struct iphdr ip_hdr_tmp, *ip_hdr; struct ipv6hdr ip6_hdr_tmp, *ip6_hdr; struct ethhdr ethhdr_tmp, *ethhdr; struct vlan_ethhdr *vhdr, vhdr_tmp; u32 prio; /* already set, do nothing */ if (skb->priority >= 256 && skb->priority <= 263) return; ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), ðhdr_tmp); if (!ethhdr) return; switch (ethhdr->h_proto) { case htons(ETH_P_8021Q): vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr), sizeof(*vhdr), &vhdr_tmp); if (!vhdr) return; prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK; prio = prio >> VLAN_PRIO_SHIFT; break; case htons(ETH_P_IP): ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), sizeof(*ip_hdr), &ip_hdr_tmp); if (!ip_hdr) return; prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5; break; case htons(ETH_P_IPV6): ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), sizeof(*ip6_hdr), &ip6_hdr_tmp); if (!ip6_hdr) return; prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5; break; default: return; } skb->priority = prio + 256; } static int batadv_recv_unhandled_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { kfree_skb(skb); return NET_RX_DROP; } /* incoming packets with the batman ethertype received on any active hard * interface */ /** * batadv_batman_skb_recv() - Handle incoming message from an hard interface * @skb: the received packet * @dev: the net device that the packet was received on * @ptype: packet type of incoming packet (ETH_P_BATMAN) * @orig_dev: the original receive net device (e.g. bonded device) * * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure */ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { struct batadv_priv *bat_priv; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *hard_iface; u8 idx; hard_iface = container_of(ptype, struct batadv_hard_iface, batman_adv_ptype); /* Prevent processing a packet received on an interface which is getting * shut down otherwise the packet may trigger de-reference errors * further down in the receive path. */ if (!kref_get_unless_zero(&hard_iface->refcount)) goto err_out; skb = skb_share_check(skb, GFP_ATOMIC); /* skb was released by skb_share_check() */ if (!skb) goto err_put; /* packet should hold at least type and version */ if (unlikely(!pskb_may_pull(skb, 2))) goto err_free; /* expect a valid ethernet header here. */ if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb))) goto err_free; if (!hard_iface->soft_iface) goto err_free; bat_priv = netdev_priv(hard_iface->soft_iface); if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto err_free; /* discard frames on not active interfaces */ if (hard_iface->if_status != BATADV_IF_ACTIVE) goto err_free; batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data; if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: incompatible batman version (%i)\n", batadv_ogm_packet->version); goto err_free; } /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); idx = batadv_ogm_packet->packet_type; (*batadv_rx_handler[idx])(skb, hard_iface); batadv_hardif_put(hard_iface); /* return NET_RX_SUCCESS in any case as we * most probably dropped the packet for * routing-logical reasons. */ return NET_RX_SUCCESS; err_free: kfree_skb(skb); err_put: batadv_hardif_put(hard_iface); err_out: return NET_RX_DROP; } static void batadv_recv_handler_init(void) { int i; for (i = 0; i < ARRAY_SIZE(batadv_rx_handler); i++) batadv_rx_handler[i] = batadv_recv_unhandled_packet; for (i = BATADV_UNICAST_MIN; i <= BATADV_UNICAST_MAX; i++) batadv_rx_handler[i] = batadv_recv_unhandled_unicast_packet; /* compile time checks for sizes */ BUILD_BUG_ON(sizeof(struct batadv_bla_claim_dst) != 6); BUILD_BUG_ON(sizeof(struct batadv_ogm_packet) != 24); BUILD_BUG_ON(sizeof(struct batadv_icmp_header) != 20); BUILD_BUG_ON(sizeof(struct batadv_icmp_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_icmp_packet_rr) != 116); BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10); BUILD_BUG_ON(sizeof(struct batadv_unicast_4addr_packet) != 18); BUILD_BUG_ON(sizeof(struct batadv_frag_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_bcast_packet) != 14); BUILD_BUG_ON(sizeof(struct batadv_coded_packet) != 46); BUILD_BUG_ON(sizeof(struct batadv_unicast_tvlv_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_tvlv_hdr) != 4); BUILD_BUG_ON(sizeof(struct batadv_tvlv_gateway_data) != 8); BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_vlan_data) != 8); BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12); BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8); i = sizeof_field(struct sk_buff, cb); BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i); /* broadcast packet */ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet; /* multicast packet */ batadv_rx_handler[BATADV_MCAST] = batadv_recv_mcast_packet; /* unicast packets ... */ /* unicast with 4 addresses packet */ batadv_rx_handler[BATADV_UNICAST_4ADDR] = batadv_recv_unicast_packet; /* unicast packet */ batadv_rx_handler[BATADV_UNICAST] = batadv_recv_unicast_packet; /* unicast tvlv packet */ batadv_rx_handler[BATADV_UNICAST_TVLV] = batadv_recv_unicast_tvlv; /* batman icmp packet */ batadv_rx_handler[BATADV_ICMP] = batadv_recv_icmp_packet; /* Fragmented packets */ batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet; } /** * batadv_recv_handler_register() - Register handler for batman-adv packet type * @packet_type: batadv_packettype which should be handled * @recv_handler: receive handler for the packet type * * Return: 0 on success or negative error number in case of failure */ int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { int (*curr)(struct sk_buff *skb, struct batadv_hard_iface *recv_if); curr = batadv_rx_handler[packet_type]; if (curr != batadv_recv_unhandled_packet && curr != batadv_recv_unhandled_unicast_packet) return -EBUSY; batadv_rx_handler[packet_type] = recv_handler; return 0; } /** * batadv_recv_handler_unregister() - Unregister handler for packet type * @packet_type: batadv_packettype which should no longer be handled */ void batadv_recv_handler_unregister(u8 packet_type) { batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } /** * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in * the header * @skb: skb pointing to fragmented socket buffers * @payload_ptr: Pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. * * Return: big endian crc32c of the checksummed data */ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { u32 crc = 0; unsigned int from; unsigned int to = skb->len; struct skb_seq_state st; const u8 *data; unsigned int len; unsigned int consumed = 0; from = (unsigned int)(payload_ptr - skb->data); skb_prepare_seq_read(skb, from, to, &st); while ((len = skb_seq_read(consumed, &data, &st)) != 0) { crc = crc32c(crc, data, len); consumed += len; } return htonl(crc); } /** * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the * skb is vlan tagged. Otherwise BATADV_NO_FLAGS. */ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) { struct ethhdr *ethhdr = (struct ethhdr *)(skb->data + header_len); struct vlan_ethhdr *vhdr; unsigned short vid; if (ethhdr->h_proto != htons(ETH_P_8021Q)) return BATADV_NO_FLAGS; if (!pskb_may_pull(skb, header_len + VLAN_ETH_HLEN)) return BATADV_NO_FLAGS; vhdr = (struct vlan_ethhdr *)(skb->data + header_len); vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; vid |= BATADV_VLAN_HAS_TAG; return vid; } /** * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) { bool ap_isolation_enabled = false; struct batadv_softif_vlan *vlan; /* if the AP isolation is requested on a VLAN, then check for its * setting in the proper VLAN private data structure */ vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); batadv_softif_vlan_put(vlan); } return ap_isolation_enabled; } /** * batadv_throw_uevent() - Send an uevent with batman-adv specific env data * @bat_priv: the bat priv with all the soft interface information * @type: subsystem type of event. Stored in uevent's BATTYPE * @action: action type of event. Stored in uevent's BATACTION * @data: string with additional information to the event (ignored for * BATADV_UEV_DEL). Stored in uevent's BATDATA * * Return: 0 on success or negative error number in case of failure */ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data) { int ret = -ENOMEM; struct kobject *bat_kobj; char *uevent_env[4] = { NULL, NULL, NULL, NULL }; bat_kobj = &bat_priv->soft_iface->dev.kobj; uevent_env[0] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_TYPE_VAR, batadv_uev_type_str[type]); if (!uevent_env[0]) goto report_error; uevent_env[1] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_ACTION_VAR, batadv_uev_action_str[action]); if (!uevent_env[1]) goto free_first_env; /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { uevent_env[2] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) goto free_second_env; } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); kfree(uevent_env[2]); free_second_env: kfree(uevent_env[1]); free_first_env: kfree(uevent_env[0]); if (ret) report_error: batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", batadv_uev_type_str[type], batadv_uev_action_str[action], (action == BATADV_UEV_DEL ? "NULL" : data), ret); return ret; } module_init(batadv_init); module_exit(batadv_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR(BATADV_DRIVER_AUTHOR); MODULE_DESCRIPTION(BATADV_DRIVER_DESC); MODULE_VERSION(BATADV_SOURCE_VERSION); MODULE_ALIAS_RTNL_LINK("batadv"); MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME); |
98 11 248 37 37 87 87 202 42 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __BEN_VLAN_802_1Q_INC__ #define __BEN_VLAN_802_1Q_INC__ #include <linux/if_vlan.h> #include <linux/u64_stats_sync.h> #include <linux/list.h> /* if this changes, algorithm will have to be reworked because this * depends on completely exhausting the VLAN identifier space. Thus * it gives constant time look-up, but in many cases it wastes memory. */ #define VLAN_GROUP_ARRAY_SPLIT_PARTS 8 #define VLAN_GROUP_ARRAY_PART_LEN (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS) enum vlan_protos { VLAN_PROTO_8021Q = 0, VLAN_PROTO_8021AD, VLAN_PROTO_NUM, }; struct vlan_group { unsigned int nr_vlan_devs; struct hlist_node hlist; /* linked list */ struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM] [VLAN_GROUP_ARRAY_SPLIT_PARTS]; }; struct vlan_info { struct net_device *real_dev; /* The ethernet(like) device * the vlan is attached to. */ struct vlan_group grp; struct list_head vid_list; unsigned int nr_vids; struct rcu_head rcu; }; static inline int vlan_proto_idx(__be16 proto) { switch (proto) { case htons(ETH_P_8021Q): return VLAN_PROTO_8021Q; case htons(ETH_P_8021AD): return VLAN_PROTO_8021AD; default: WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto)); return -EINVAL; } } static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg, unsigned int pidx, u16 vlan_id) { struct net_device **array; array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; /* paired with smp_wmb() in vlan_group_prealloc_vid() */ smp_rmb(); return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL; } static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { int pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return NULL; return __vlan_group_get_device(vg, pidx, vlan_id); } static inline void vlan_group_set_device(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id, struct net_device *dev) { int pidx = vlan_proto_idx(vlan_proto); struct net_device **array; if (!vg || pidx < 0) return; array = vg->vlan_devices_arrays[pidx] [vlan_id / VLAN_GROUP_ARRAY_PART_LEN]; array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev; } /* Must be invoked with rcu_read_lock or with RTNL. */ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference_rtnl(real_dev->vlan_info); if (vlan_info) return vlan_group_get_device(&vlan_info->grp, vlan_proto, vlan_id); return NULL; } static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev) { netdev_features_t ret; ret = real_dev->hw_enc_features & (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL); if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK)) return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM; return 0; } #define vlan_group_for_each_dev(grp, i, dev) \ for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \ if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ (i) % VLAN_N_VID))) int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto); void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto); /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); void vlan_dev_free_egress_priority(const struct net_device *dev); int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); static inline u32 vlan_get_ingress_priority(struct net_device *dev, u16 vlan_tci) { struct vlan_dev_priv *vip = vlan_dev_priv(dev); return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7]; } #ifdef CONFIG_VLAN_8021Q_GVRP int vlan_gvrp_request_join(const struct net_device *dev); void vlan_gvrp_request_leave(const struct net_device *dev); int vlan_gvrp_init_applicant(struct net_device *dev); void vlan_gvrp_uninit_applicant(struct net_device *dev); int vlan_gvrp_init(void); void vlan_gvrp_uninit(void); #else static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_gvrp_request_leave(const struct net_device *dev) {} static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_gvrp_init(void) { return 0; } static inline void vlan_gvrp_uninit(void) {} #endif #ifdef CONFIG_VLAN_8021Q_MVRP int vlan_mvrp_request_join(const struct net_device *dev); void vlan_mvrp_request_leave(const struct net_device *dev); int vlan_mvrp_init_applicant(struct net_device *dev); void vlan_mvrp_uninit_applicant(struct net_device *dev); int vlan_mvrp_init(void); void vlan_mvrp_uninit(void); #else static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; } static inline void vlan_mvrp_request_leave(const struct net_device *dev) {} static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; } static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {} static inline int vlan_mvrp_init(void) { return 0; } static inline void vlan_mvrp_uninit(void) {} #endif extern const char vlan_fullname[]; extern const char vlan_version[]; int vlan_netlink_init(void); void vlan_netlink_fini(void); extern struct rtnl_link_ops vlan_link_ops; extern unsigned int vlan_net_id; struct proc_dir_entry; struct vlan_net { /* /proc/net/vlan */ struct proc_dir_entry *proc_vlan_dir; /* /proc/net/vlan/config */ struct proc_dir_entry *proc_vlan_conf; /* Determines interface naming scheme. */ unsigned short name_type; }; #endif /* !(__BEN_VLAN_802_1Q_INC__) */ |
8 8 3 3 14 14 14 14 6 6 6 6 111 111 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 | /* * Copyright (c) 2014 Samsung Electronics Co., Ltd * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sub license, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include <linux/err.h> #include <linux/media-bus-format.h> #include <linux/module.h> #include <linux/mutex.h> #include <drm/drm_atomic_state_helper.h> #include <drm/drm_bridge.h> #include <drm/drm_debugfs.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_file.h> #include <drm/drm_of.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" /** * DOC: overview * * &struct drm_bridge represents a device that hangs on to an encoder. These are * handy when a regular &drm_encoder entity isn't enough to represent the entire * encoder chain. * * A bridge is always attached to a single &drm_encoder at a time, but can be * either connected to it directly, or through a chain of bridges:: * * [ CRTC ---> ] Encoder ---> Bridge A ---> Bridge B * * Here, the output of the encoder feeds to bridge A, and that furthers feeds to * bridge B. Bridge chains can be arbitrarily long, and shall be fully linear: * Chaining multiple bridges to the output of a bridge, or the same bridge to * the output of different bridges, is not supported. * * &drm_bridge, like &drm_panel, aren't &drm_mode_object entities like planes, * CRTCs, encoders or connectors and hence are not visible to userspace. They * just provide additional hooks to get the desired output at the end of the * encoder chain. */ /** * DOC: display driver integration * * Display drivers are responsible for linking encoders with the first bridge * in the chains. This is done by acquiring the appropriate bridge with * devm_drm_of_get_bridge(). Once acquired, the bridge shall be attached to the * encoder with a call to drm_bridge_attach(). * * Bridges are responsible for linking themselves with the next bridge in the * chain, if any. This is done the same way as for encoders, with the call to * drm_bridge_attach() occurring in the &drm_bridge_funcs.attach operation. * * Once these links are created, the bridges can participate along with encoder * functions to perform mode validation and fixup (through * drm_bridge_chain_mode_valid() and drm_atomic_bridge_chain_check()), mode * setting (through drm_bridge_chain_mode_set()), enable (through * drm_atomic_bridge_chain_pre_enable() and drm_atomic_bridge_chain_enable()) * and disable (through drm_atomic_bridge_chain_disable() and * drm_atomic_bridge_chain_post_disable()). Those functions call the * corresponding operations provided in &drm_bridge_funcs in sequence for all * bridges in the chain. * * For display drivers that use the atomic helpers * drm_atomic_helper_check_modeset(), * drm_atomic_helper_commit_modeset_enables() and * drm_atomic_helper_commit_modeset_disables() (either directly in hand-rolled * commit check and commit tail handlers, or through the higher-level * drm_atomic_helper_check() and drm_atomic_helper_commit_tail() or * drm_atomic_helper_commit_tail_rpm() helpers), this is done transparently and * requires no intervention from the driver. For other drivers, the relevant * DRM bridge chain functions shall be called manually. * * Bridges also participate in implementing the &drm_connector at the end of * the bridge chain. Display drivers may use the drm_bridge_connector_init() * helper to create the &drm_connector, or implement it manually on top of the * connector-related operations exposed by the bridge (see the overview * documentation of bridge operations for more details). */ /** * DOC: special care dsi * * The interaction between the bridges and other frameworks involved in * the probing of the upstream driver and the bridge driver can be * challenging. Indeed, there's multiple cases that needs to be * considered: * * - The upstream driver doesn't use the component framework and isn't a * MIPI-DSI host. In this case, the bridge driver will probe at some * point and the upstream driver should try to probe again by returning * EPROBE_DEFER as long as the bridge driver hasn't probed. * * - The upstream driver doesn't use the component framework, but is a * MIPI-DSI host. The bridge device uses the MIPI-DCS commands to be * controlled. In this case, the bridge device is a child of the * display device and when it will probe it's assured that the display * device (and MIPI-DSI host) is present. The upstream driver will be * assured that the bridge driver is connected between the * &mipi_dsi_host_ops.attach and &mipi_dsi_host_ops.detach operations. * Therefore, it must run mipi_dsi_host_register() in its probe * function, and then run drm_bridge_attach() in its * &mipi_dsi_host_ops.attach hook. * * - The upstream driver uses the component framework and is a MIPI-DSI * host. The bridge device uses the MIPI-DCS commands to be * controlled. This is the same situation than above, and can run * mipi_dsi_host_register() in either its probe or bind hooks. * * - The upstream driver uses the component framework and is a MIPI-DSI * host. The bridge device uses a separate bus (such as I2C) to be * controlled. In this case, there's no correlation between the probe * of the bridge and upstream drivers, so care must be taken to avoid * an endless EPROBE_DEFER loop, with each driver waiting for the * other to probe. * * The ideal pattern to cover the last item (and all the others in the * MIPI-DSI host driver case) is to split the operations like this: * * - The MIPI-DSI host driver must run mipi_dsi_host_register() in its * probe hook. It will make sure that the MIPI-DSI host sticks around, * and that the driver's bind can be called. * * - In its probe hook, the bridge driver must try to find its MIPI-DSI * host, register as a MIPI-DSI device and attach the MIPI-DSI device * to its host. The bridge driver is now functional. * * - In its &struct mipi_dsi_host_ops.attach hook, the MIPI-DSI host can * now add its component. Its bind hook will now be called and since * the bridge driver is attached and registered, we can now look for * and attach it. * * At this point, we're now certain that both the upstream driver and * the bridge driver are functional and we can't have a deadlock-like * situation when probing. */ /** * DOC: dsi bridge operations * * DSI host interfaces are expected to be implemented as bridges rather than * encoders, however there are a few aspects of their operation that need to * be defined in order to provide a consistent interface. * * A DSI host should keep the PHY powered down until the pre_enable operation is * called. All lanes are in an undefined idle state up to this point, and it * must not be assumed that it is LP-11. * pre_enable should initialise the PHY, set the data lanes to LP-11, and the * clock lane to either LP-11 or HS depending on the mode_flag * %MIPI_DSI_CLOCK_NON_CONTINUOUS. * * Ordinarily the downstream bridge DSI peripheral pre_enable will have been * called before the DSI host. If the DSI peripheral requires LP-11 and/or * the clock lane to be in HS mode prior to pre_enable, then it can set the * &pre_enable_prev_first flag to request the pre_enable (and * post_disable) order to be altered to enable the DSI host first. * * Either the CRTC being enabled, or the DSI host enable operation should switch * the host to actively transmitting video on the data lanes. * * The reverse also applies. The DSI host disable operation or stopping the CRTC * should stop transmitting video, and the data lanes should return to the LP-11 * state. The DSI host &post_disable operation should disable the PHY. * If the &pre_enable_prev_first flag is set, then the DSI peripheral's * bridge &post_disable will be called before the DSI host's post_disable. * * Whilst it is valid to call &host_transfer prior to pre_enable or after * post_disable, the exact state of the lanes is undefined at this point. The * DSI host should initialise the interface, transmit the data, and then disable * the interface again. * * Ultra Low Power State (ULPS) is not explicitly supported by DRM. If * implemented, it therefore needs to be handled entirely within the DSI Host * driver. */ static DEFINE_MUTEX(bridge_lock); static LIST_HEAD(bridge_list); /** * drm_bridge_add - add the given bridge to the global bridge list * * @bridge: bridge control structure */ void drm_bridge_add(struct drm_bridge *bridge) { mutex_init(&bridge->hpd_mutex); mutex_lock(&bridge_lock); list_add_tail(&bridge->list, &bridge_list); mutex_unlock(&bridge_lock); } EXPORT_SYMBOL(drm_bridge_add); static void drm_bridge_remove_void(void *bridge) { drm_bridge_remove(bridge); } /** * devm_drm_bridge_add - devm managed version of drm_bridge_add() * * @dev: device to tie the bridge lifetime to * @bridge: bridge control structure * * This is the managed version of drm_bridge_add() which automatically * calls drm_bridge_remove() when @dev is unbound. * * Return: 0 if no error or negative error code. */ int devm_drm_bridge_add(struct device *dev, struct drm_bridge *bridge) { drm_bridge_add(bridge); return devm_add_action_or_reset(dev, drm_bridge_remove_void, bridge); } EXPORT_SYMBOL(devm_drm_bridge_add); /** * drm_bridge_remove - remove the given bridge from the global bridge list * * @bridge: bridge control structure */ void drm_bridge_remove(struct drm_bridge *bridge) { mutex_lock(&bridge_lock); list_del_init(&bridge->list); mutex_unlock(&bridge_lock); mutex_destroy(&bridge->hpd_mutex); } EXPORT_SYMBOL(drm_bridge_remove); static struct drm_private_state * drm_bridge_atomic_duplicate_priv_state(struct drm_private_obj *obj) { struct drm_bridge *bridge = drm_priv_to_bridge(obj); struct drm_bridge_state *state; state = bridge->funcs->atomic_duplicate_state(bridge); return state ? &state->base : NULL; } static void drm_bridge_atomic_destroy_priv_state(struct drm_private_obj *obj, struct drm_private_state *s) { struct drm_bridge_state *state = drm_priv_to_bridge_state(s); struct drm_bridge *bridge = drm_priv_to_bridge(obj); bridge->funcs->atomic_destroy_state(bridge, state); } static const struct drm_private_state_funcs drm_bridge_priv_state_funcs = { .atomic_duplicate_state = drm_bridge_atomic_duplicate_priv_state, .atomic_destroy_state = drm_bridge_atomic_destroy_priv_state, }; /** * drm_bridge_attach - attach the bridge to an encoder's chain * * @encoder: DRM encoder * @bridge: bridge to attach * @previous: previous bridge in the chain (optional) * @flags: DRM_BRIDGE_ATTACH_* flags * * Called by a kms driver to link the bridge to an encoder's chain. The previous * argument specifies the previous bridge in the chain. If NULL, the bridge is * linked directly at the encoder's output. Otherwise it is linked at the * previous bridge's output. * * If non-NULL the previous bridge must be already attached by a call to this * function. * * Note that bridges attached to encoders are auto-detached during encoder * cleanup in drm_encoder_cleanup(), so drm_bridge_attach() should generally * *not* be balanced with a drm_bridge_detach() in driver code. * * RETURNS: * Zero on success, error code on failure */ int drm_bridge_attach(struct drm_encoder *encoder, struct drm_bridge *bridge, struct drm_bridge *previous, enum drm_bridge_attach_flags flags) { int ret; if (!encoder || !bridge) return -EINVAL; if (previous && (!previous->dev || previous->encoder != encoder)) return -EINVAL; if (bridge->dev) return -EBUSY; bridge->dev = encoder->dev; bridge->encoder = encoder; if (previous) list_add(&bridge->chain_node, &previous->chain_node); else list_add(&bridge->chain_node, &encoder->bridge_chain); if (bridge->funcs->attach) { ret = bridge->funcs->attach(bridge, flags); if (ret < 0) goto err_reset_bridge; } if (bridge->funcs->atomic_reset) { struct drm_bridge_state *state; state = bridge->funcs->atomic_reset(bridge); if (IS_ERR(state)) { ret = PTR_ERR(state); goto err_detach_bridge; } drm_atomic_private_obj_init(bridge->dev, &bridge->base, &state->base, &drm_bridge_priv_state_funcs); } return 0; err_detach_bridge: if (bridge->funcs->detach) bridge->funcs->detach(bridge); err_reset_bridge: bridge->dev = NULL; bridge->encoder = NULL; list_del(&bridge->chain_node); if (ret != -EPROBE_DEFER) DRM_ERROR("failed to attach bridge %pOF to encoder %s: %d\n", bridge->of_node, encoder->name, ret); else dev_err_probe(encoder->dev->dev, -EPROBE_DEFER, "failed to attach bridge %pOF to encoder %s\n", bridge->of_node, encoder->name); return ret; } EXPORT_SYMBOL(drm_bridge_attach); void drm_bridge_detach(struct drm_bridge *bridge) { if (WARN_ON(!bridge)) return; if (WARN_ON(!bridge->dev)) return; if (bridge->funcs->atomic_reset) drm_atomic_private_obj_fini(&bridge->base); if (bridge->funcs->detach) bridge->funcs->detach(bridge); list_del(&bridge->chain_node); bridge->dev = NULL; } /** * DOC: bridge operations * * Bridge drivers expose operations through the &drm_bridge_funcs structure. * The DRM internals (atomic and CRTC helpers) use the helpers defined in * drm_bridge.c to call bridge operations. Those operations are divided in * three big categories to support different parts of the bridge usage. * * - The encoder-related operations support control of the bridges in the * chain, and are roughly counterparts to the &drm_encoder_helper_funcs * operations. They are used by the legacy CRTC and the atomic modeset * helpers to perform mode validation, fixup and setting, and enable and * disable the bridge automatically. * * The enable and disable operations are split in * &drm_bridge_funcs.pre_enable, &drm_bridge_funcs.enable, * &drm_bridge_funcs.disable and &drm_bridge_funcs.post_disable to provide * finer-grained control. * * Bridge drivers may implement the legacy version of those operations, or * the atomic version (prefixed with atomic\_), in which case they shall also * implement the atomic state bookkeeping operations * (&drm_bridge_funcs.atomic_duplicate_state, * &drm_bridge_funcs.atomic_destroy_state and &drm_bridge_funcs.reset). * Mixing atomic and non-atomic versions of the operations is not supported. * * - The bus format negotiation operations * &drm_bridge_funcs.atomic_get_output_bus_fmts and * &drm_bridge_funcs.atomic_get_input_bus_fmts allow bridge drivers to * negotiate the formats transmitted between bridges in the chain when * multiple formats are supported. Negotiation for formats is performed * transparently for display drivers by the atomic modeset helpers. Only * atomic versions of those operations exist, bridge drivers that need to * implement them shall thus also implement the atomic version of the * encoder-related operations. This feature is not supported by the legacy * CRTC helpers. * * - The connector-related operations support implementing a &drm_connector * based on a chain of bridges. DRM bridges traditionally create a * &drm_connector for bridges meant to be used at the end of the chain. This * puts additional burden on bridge drivers, especially for bridges that may * be used in the middle of a chain or at the end of it. Furthermore, it * requires all operations of the &drm_connector to be handled by a single * bridge, which doesn't always match the hardware architecture. * * To simplify bridge drivers and make the connector implementation more * flexible, a new model allows bridges to unconditionally skip creation of * &drm_connector and instead expose &drm_bridge_funcs operations to support * an externally-implemented &drm_connector. Those operations are * &drm_bridge_funcs.detect, &drm_bridge_funcs.get_modes, * &drm_bridge_funcs.get_edid, &drm_bridge_funcs.hpd_notify, * &drm_bridge_funcs.hpd_enable and &drm_bridge_funcs.hpd_disable. When * implemented, display drivers shall create a &drm_connector instance for * each chain of bridges, and implement those connector instances based on * the bridge connector operations. * * Bridge drivers shall implement the connector-related operations for all * the features that the bridge hardware support. For instance, if a bridge * supports reading EDID, the &drm_bridge_funcs.get_edid shall be * implemented. This however doesn't mean that the DDC lines are wired to the * bridge on a particular platform, as they could also be connected to an I2C * controller of the SoC. Support for the connector-related operations on the * running platform is reported through the &drm_bridge.ops flags. Bridge * drivers shall detect which operations they can support on the platform * (usually this information is provided by ACPI or DT), and set the * &drm_bridge.ops flags for all supported operations. A flag shall only be * set if the corresponding &drm_bridge_funcs operation is implemented, but * an implemented operation doesn't necessarily imply that the corresponding * flag will be set. Display drivers shall use the &drm_bridge.ops flags to * decide which bridge to delegate a connector operation to. This mechanism * allows providing a single static const &drm_bridge_funcs instance in * bridge drivers, improving security by storing function pointers in * read-only memory. * * In order to ease transition, bridge drivers may support both the old and * new models by making connector creation optional and implementing the * connected-related bridge operations. Connector creation is then controlled * by the flags argument to the drm_bridge_attach() function. Display drivers * that support the new model and create connectors themselves shall set the * %DRM_BRIDGE_ATTACH_NO_CONNECTOR flag, and bridge drivers shall then skip * connector creation. For intermediate bridges in the chain, the flag shall * be passed to the drm_bridge_attach() call for the downstream bridge. * Bridge drivers that implement the new model only shall return an error * from their &drm_bridge_funcs.attach handler when the * %DRM_BRIDGE_ATTACH_NO_CONNECTOR flag is not set. New display drivers * should use the new model, and convert the bridge drivers they use if * needed, in order to gradually transition to the new model. */ /** * drm_bridge_chain_mode_valid - validate the mode against all bridges in the * encoder chain. * @bridge: bridge control structure * @info: display info against which the mode shall be validated * @mode: desired mode to be validated * * Calls &drm_bridge_funcs.mode_valid for all the bridges in the encoder * chain, starting from the first bridge to the last. If at least one bridge * does not accept the mode the function returns the error code. * * Note: the bridge passed should be the one closest to the encoder. * * RETURNS: * MODE_OK on success, drm_mode_status Enum error code on failure */ enum drm_mode_status drm_bridge_chain_mode_valid(struct drm_bridge *bridge, const struct drm_display_info *info, const struct drm_display_mode *mode) { struct drm_encoder *encoder; if (!bridge) return MODE_OK; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { enum drm_mode_status ret; if (!bridge->funcs->mode_valid) continue; ret = bridge->funcs->mode_valid(bridge, info, mode); if (ret != MODE_OK) return ret; } return MODE_OK; } EXPORT_SYMBOL(drm_bridge_chain_mode_valid); /** * drm_bridge_chain_mode_set - set proposed mode for all bridges in the * encoder chain * @bridge: bridge control structure * @mode: desired mode to be set for the encoder chain * @adjusted_mode: updated mode that works for this encoder chain * * Calls &drm_bridge_funcs.mode_set op for all the bridges in the * encoder chain, starting from the first bridge to the last. * * Note: the bridge passed should be the one closest to the encoder */ void drm_bridge_chain_mode_set(struct drm_bridge *bridge, const struct drm_display_mode *mode, const struct drm_display_mode *adjusted_mode) { struct drm_encoder *encoder; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { if (bridge->funcs->mode_set) bridge->funcs->mode_set(bridge, mode, adjusted_mode); } } EXPORT_SYMBOL(drm_bridge_chain_mode_set); /** * drm_atomic_bridge_chain_disable - disables all bridges in the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_disable (falls back on * &drm_bridge_funcs.disable) op for all the bridges in the encoder chain, * starting from the last bridge to the first. These are called before calling * &drm_encoder_helper_funcs.atomic_disable * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_disable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; struct drm_bridge *iter; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_reverse(iter, &encoder->bridge_chain, chain_node) { if (iter->funcs->atomic_disable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, iter); if (WARN_ON(!old_bridge_state)) return; iter->funcs->atomic_disable(iter, old_bridge_state); } else if (iter->funcs->disable) { iter->funcs->disable(iter); } if (iter == bridge) break; } } EXPORT_SYMBOL(drm_atomic_bridge_chain_disable); static void drm_atomic_bridge_call_post_disable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { if (old_state && bridge->funcs->atomic_post_disable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, bridge); if (WARN_ON(!old_bridge_state)) return; bridge->funcs->atomic_post_disable(bridge, old_bridge_state); } else if (bridge->funcs->post_disable) { bridge->funcs->post_disable(bridge); } } /** * drm_atomic_bridge_chain_post_disable - cleans up after disabling all bridges * in the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_post_disable (falls back on * &drm_bridge_funcs.post_disable) op for all the bridges in the encoder chain, * starting from the first bridge to the last. These are called after completing * &drm_encoder_helper_funcs.atomic_disable * * If a bridge sets @pre_enable_prev_first, then the @post_disable for that * bridge will be called before the previous one to reverse the @pre_enable * calling direction. * * Example: * Bridge A ---> Bridge B ---> Bridge C ---> Bridge D ---> Bridge E * * With pre_enable_prev_first flag enable in Bridge B, D, E then the resulting * @post_disable order would be, * Bridge B, Bridge A, Bridge E, Bridge D, Bridge C. * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_post_disable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; struct drm_bridge *next, *limit; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { limit = NULL; if (!list_is_last(&bridge->chain_node, &encoder->bridge_chain)) { next = list_next_entry(bridge, chain_node); if (next->pre_enable_prev_first) { /* next bridge had requested that prev * was enabled first, so disabled last */ limit = next; /* Find the next bridge that has NOT requested * prev to be enabled first / disabled last */ list_for_each_entry_from(next, &encoder->bridge_chain, chain_node) { if (!next->pre_enable_prev_first) { next = list_prev_entry(next, chain_node); limit = next; break; } if (list_is_last(&next->chain_node, &encoder->bridge_chain)) { limit = next; break; } } /* Call these bridges in reverse order */ list_for_each_entry_from_reverse(next, &encoder->bridge_chain, chain_node) { if (next == bridge) break; drm_atomic_bridge_call_post_disable(next, old_state); } } } drm_atomic_bridge_call_post_disable(bridge, old_state); if (limit) /* Jump all bridges that we have already post_disabled */ bridge = limit; } } EXPORT_SYMBOL(drm_atomic_bridge_chain_post_disable); static void drm_atomic_bridge_call_pre_enable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { if (old_state && bridge->funcs->atomic_pre_enable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, bridge); if (WARN_ON(!old_bridge_state)) return; bridge->funcs->atomic_pre_enable(bridge, old_bridge_state); } else if (bridge->funcs->pre_enable) { bridge->funcs->pre_enable(bridge); } } /** * drm_atomic_bridge_chain_pre_enable - prepares for enabling all bridges in * the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_pre_enable (falls back on * &drm_bridge_funcs.pre_enable) op for all the bridges in the encoder chain, * starting from the last bridge to the first. These are called before calling * &drm_encoder_helper_funcs.atomic_enable * * If a bridge sets @pre_enable_prev_first, then the pre_enable for the * prev bridge will be called before pre_enable of this bridge. * * Example: * Bridge A ---> Bridge B ---> Bridge C ---> Bridge D ---> Bridge E * * With pre_enable_prev_first flag enable in Bridge B, D, E then the resulting * @pre_enable order would be, * Bridge C, Bridge D, Bridge E, Bridge A, Bridge B. * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_pre_enable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; struct drm_bridge *iter, *next, *limit; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_reverse(iter, &encoder->bridge_chain, chain_node) { if (iter->pre_enable_prev_first) { next = iter; limit = bridge; list_for_each_entry_from_reverse(next, &encoder->bridge_chain, chain_node) { if (next == bridge) break; if (!next->pre_enable_prev_first) { /* Found first bridge that does NOT * request prev to be enabled first */ limit = next; break; } } list_for_each_entry_from(next, &encoder->bridge_chain, chain_node) { /* Call requested prev bridge pre_enable * in order. */ if (next == iter) /* At the first bridge to request prev * bridges called first. */ break; drm_atomic_bridge_call_pre_enable(next, old_state); } } drm_atomic_bridge_call_pre_enable(iter, old_state); if (iter->pre_enable_prev_first) /* Jump all bridges that we have already pre_enabled */ iter = limit; if (iter == bridge) break; } } EXPORT_SYMBOL(drm_atomic_bridge_chain_pre_enable); /** * drm_atomic_bridge_chain_enable - enables all bridges in the encoder chain * @bridge: bridge control structure * @old_state: old atomic state * * Calls &drm_bridge_funcs.atomic_enable (falls back on * &drm_bridge_funcs.enable) op for all the bridges in the encoder chain, * starting from the first bridge to the last. These are called after completing * &drm_encoder_helper_funcs.atomic_enable * * Note: the bridge passed should be the one closest to the encoder */ void drm_atomic_bridge_chain_enable(struct drm_bridge *bridge, struct drm_atomic_state *old_state) { struct drm_encoder *encoder; if (!bridge) return; encoder = bridge->encoder; list_for_each_entry_from(bridge, &encoder->bridge_chain, chain_node) { if (bridge->funcs->atomic_enable) { struct drm_bridge_state *old_bridge_state; old_bridge_state = drm_atomic_get_old_bridge_state(old_state, bridge); if (WARN_ON(!old_bridge_state)) return; bridge->funcs->atomic_enable(bridge, old_bridge_state); } else if (bridge->funcs->enable) { bridge->funcs->enable(bridge); } } } EXPORT_SYMBOL(drm_atomic_bridge_chain_enable); static int drm_atomic_bridge_check(struct drm_bridge *bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state) { if (bridge->funcs->atomic_check) { struct drm_bridge_state *bridge_state; int ret; bridge_state = drm_atomic_get_new_bridge_state(crtc_state->state, bridge); if (WARN_ON(!bridge_state)) return -EINVAL; ret = bridge->funcs->atomic_check(bridge, bridge_state, crtc_state, conn_state); if (ret) return ret; } else if (bridge->funcs->mode_fixup) { if (!bridge->funcs->mode_fixup(bridge, &crtc_state->mode, &crtc_state->adjusted_mode)) return -EINVAL; } return 0; } static int select_bus_fmt_recursive(struct drm_bridge *first_bridge, struct drm_bridge *cur_bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state, u32 out_bus_fmt) { unsigned int i, num_in_bus_fmts = 0; struct drm_bridge_state *cur_state; struct drm_bridge *prev_bridge; u32 *in_bus_fmts; int ret; prev_bridge = drm_bridge_get_prev_bridge(cur_bridge); cur_state = drm_atomic_get_new_bridge_state(crtc_state->state, cur_bridge); /* * If bus format negotiation is not supported by this bridge, let's * pass MEDIA_BUS_FMT_FIXED to the previous bridge in the chain and * hope that it can handle this situation gracefully (by providing * appropriate default values). */ if (!cur_bridge->funcs->atomic_get_input_bus_fmts) { if (cur_bridge != first_bridge) { ret = select_bus_fmt_recursive(first_bridge, prev_bridge, crtc_state, conn_state, MEDIA_BUS_FMT_FIXED); if (ret) return ret; } /* * Driver does not implement the atomic state hooks, but that's * fine, as long as it does not access the bridge state. */ if (cur_state) { cur_state->input_bus_cfg.format = MEDIA_BUS_FMT_FIXED; cur_state->output_bus_cfg.format = out_bus_fmt; } return 0; } /* * If the driver implements ->atomic_get_input_bus_fmts() it * should also implement the atomic state hooks. */ if (WARN_ON(!cur_state)) return -EINVAL; in_bus_fmts = cur_bridge->funcs->atomic_get_input_bus_fmts(cur_bridge, cur_state, crtc_state, conn_state, out_bus_fmt, &num_in_bus_fmts); if (!num_in_bus_fmts) return -ENOTSUPP; else if (!in_bus_fmts) return -ENOMEM; if (first_bridge == cur_bridge) { cur_state->input_bus_cfg.format = in_bus_fmts[0]; cur_state->output_bus_cfg.format = out_bus_fmt; kfree(in_bus_fmts); return 0; } for (i = 0; i < num_in_bus_fmts; i++) { ret = select_bus_fmt_recursive(first_bridge, prev_bridge, crtc_state, conn_state, in_bus_fmts[i]); if (ret != -ENOTSUPP) break; } if (!ret) { cur_state->input_bus_cfg.format = in_bus_fmts[i]; cur_state->output_bus_cfg.format = out_bus_fmt; } kfree(in_bus_fmts); return ret; } /* * This function is called by &drm_atomic_bridge_chain_check() just before * calling &drm_bridge_funcs.atomic_check() on all elements of the chain. * It performs bus format negotiation between bridge elements. The negotiation * happens in reverse order, starting from the last element in the chain up to * @bridge. * * Negotiation starts by retrieving supported output bus formats on the last * bridge element and testing them one by one. The test is recursive, meaning * that for each tested output format, the whole chain will be walked backward, * and each element will have to choose an input bus format that can be * transcoded to the requested output format. When a bridge element does not * support transcoding into a specific output format -ENOTSUPP is returned and * the next bridge element will have to try a different format. If none of the * combinations worked, -ENOTSUPP is returned and the atomic modeset will fail. * * This implementation is relying on * &drm_bridge_funcs.atomic_get_output_bus_fmts() and * &drm_bridge_funcs.atomic_get_input_bus_fmts() to gather supported * input/output formats. * * When &drm_bridge_funcs.atomic_get_output_bus_fmts() is not implemented by * the last element of the chain, &drm_atomic_bridge_chain_select_bus_fmts() * tries a single format: &drm_connector.display_info.bus_formats[0] if * available, MEDIA_BUS_FMT_FIXED otherwise. * * When &drm_bridge_funcs.atomic_get_input_bus_fmts() is not implemented, * &drm_atomic_bridge_chain_select_bus_fmts() skips the negotiation on the * bridge element that lacks this hook and asks the previous element in the * chain to try MEDIA_BUS_FMT_FIXED. It's up to bridge drivers to decide what * to do in that case (fail if they want to enforce bus format negotiation, or * provide a reasonable default if they need to support pipelines where not * all elements support bus format negotiation). */ static int drm_atomic_bridge_chain_select_bus_fmts(struct drm_bridge *bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state) { struct drm_connector *conn = conn_state->connector; struct drm_encoder *encoder = bridge->encoder; struct drm_bridge_state *last_bridge_state; unsigned int i, num_out_bus_fmts = 0; struct drm_bridge *last_bridge; u32 *out_bus_fmts; int ret = 0; last_bridge = list_last_entry(&encoder->bridge_chain, struct drm_bridge, chain_node); last_bridge_state = drm_atomic_get_new_bridge_state(crtc_state->state, last_bridge); if (last_bridge->funcs->atomic_get_output_bus_fmts) { const struct drm_bridge_funcs *funcs = last_bridge->funcs; /* * If the driver implements ->atomic_get_output_bus_fmts() it * should also implement the atomic state hooks. */ if (WARN_ON(!last_bridge_state)) return -EINVAL; out_bus_fmts = funcs->atomic_get_output_bus_fmts(last_bridge, last_bridge_state, crtc_state, conn_state, &num_out_bus_fmts); if (!num_out_bus_fmts) return -ENOTSUPP; else if (!out_bus_fmts) return -ENOMEM; } else { num_out_bus_fmts = 1; out_bus_fmts = kmalloc(sizeof(*out_bus_fmts), GFP_KERNEL); if (!out_bus_fmts) return -ENOMEM; if (conn->display_info.num_bus_formats && conn->display_info.bus_formats) out_bus_fmts[0] = conn->display_info.bus_formats[0]; else out_bus_fmts[0] = MEDIA_BUS_FMT_FIXED; } for (i = 0; i < num_out_bus_fmts; i++) { ret = select_bus_fmt_recursive(bridge, last_bridge, crtc_state, conn_state, out_bus_fmts[i]); if (ret != -ENOTSUPP) break; } kfree(out_bus_fmts); return ret; } static void drm_atomic_bridge_propagate_bus_flags(struct drm_bridge *bridge, struct drm_connector *conn, struct drm_atomic_state *state) { struct drm_bridge_state *bridge_state, *next_bridge_state; struct drm_bridge *next_bridge; u32 output_flags = 0; bridge_state = drm_atomic_get_new_bridge_state(state, bridge); /* No bridge state attached to this bridge => nothing to propagate. */ if (!bridge_state) return; next_bridge = drm_bridge_get_next_bridge(bridge); /* * Let's try to apply the most common case here, that is, propagate * display_info flags for the last bridge, and propagate the input * flags of the next bridge element to the output end of the current * bridge when the bridge is not the last one. * There are exceptions to this rule, like when signal inversion is * happening at the board level, but that's something drivers can deal * with from their &drm_bridge_funcs.atomic_check() implementation by * simply overriding the flags value we've set here. */ if (!next_bridge) { output_flags = conn->display_info.bus_flags; } else { next_bridge_state = drm_atomic_get_new_bridge_state(state, next_bridge); /* * No bridge state attached to the next bridge, just leave the * flags to 0. */ if (next_bridge_state) output_flags = next_bridge_state->input_bus_cfg.flags; } bridge_state->output_bus_cfg.flags = output_flags; /* * Propagate the output flags to the input end of the bridge. Again, it's * not necessarily what all bridges want, but that's what most of them * do, and by doing that by default we avoid forcing drivers to * duplicate the "dummy propagation" logic. */ bridge_state->input_bus_cfg.flags = output_flags; } /** * drm_atomic_bridge_chain_check() - Do an atomic check on the bridge chain * @bridge: bridge control structure * @crtc_state: new CRTC state * @conn_state: new connector state * * First trigger a bus format negotiation before calling * &drm_bridge_funcs.atomic_check() (falls back on * &drm_bridge_funcs.mode_fixup()) op for all the bridges in the encoder chain, * starting from the last bridge to the first. These are called before calling * &drm_encoder_helper_funcs.atomic_check() * * RETURNS: * 0 on success, a negative error code on failure */ int drm_atomic_bridge_chain_check(struct drm_bridge *bridge, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state) { struct drm_connector *conn = conn_state->connector; struct drm_encoder *encoder; struct drm_bridge *iter; int ret; if (!bridge) return 0; ret = drm_atomic_bridge_chain_select_bus_fmts(bridge, crtc_state, conn_state); if (ret) return ret; encoder = bridge->encoder; list_for_each_entry_reverse(iter, &encoder->bridge_chain, chain_node) { int ret; /* * Bus flags are propagated by default. If a bridge needs to * tweak the input bus flags for any reason, it should happen * in its &drm_bridge_funcs.atomic_check() implementation such * that preceding bridges in the chain can propagate the new * bus flags. */ drm_atomic_bridge_propagate_bus_flags(iter, conn, crtc_state->state); ret = drm_atomic_bridge_check(iter, crtc_state, conn_state); if (ret) return ret; if (iter == bridge) break; } return 0; } EXPORT_SYMBOL(drm_atomic_bridge_chain_check); /** * drm_bridge_detect - check if anything is attached to the bridge output * @bridge: bridge control structure * * If the bridge supports output detection, as reported by the * DRM_BRIDGE_OP_DETECT bridge ops flag, call &drm_bridge_funcs.detect for the * bridge and return the connection status. Otherwise return * connector_status_unknown. * * RETURNS: * The detection status on success, or connector_status_unknown if the bridge * doesn't support output detection. */ enum drm_connector_status drm_bridge_detect(struct drm_bridge *bridge) { if (!(bridge->ops & DRM_BRIDGE_OP_DETECT)) return connector_status_unknown; return bridge->funcs->detect(bridge); } EXPORT_SYMBOL_GPL(drm_bridge_detect); /** * drm_bridge_get_modes - fill all modes currently valid for the sink into the * @connector * @bridge: bridge control structure * @connector: the connector to fill with modes * * If the bridge supports output modes retrieval, as reported by the * DRM_BRIDGE_OP_MODES bridge ops flag, call &drm_bridge_funcs.get_modes to * fill the connector with all valid modes and return the number of modes * added. Otherwise return 0. * * RETURNS: * The number of modes added to the connector. */ int drm_bridge_get_modes(struct drm_bridge *bridge, struct drm_connector *connector) { if (!(bridge->ops & DRM_BRIDGE_OP_MODES)) return 0; return bridge->funcs->get_modes(bridge, connector); } EXPORT_SYMBOL_GPL(drm_bridge_get_modes); /** * drm_bridge_edid_read - read the EDID data of the connected display * @bridge: bridge control structure * @connector: the connector to read EDID for * * If the bridge supports output EDID retrieval, as reported by the * DRM_BRIDGE_OP_EDID bridge ops flag, call &drm_bridge_funcs.edid_read to get * the EDID and return it. Otherwise return NULL. * * RETURNS: * The retrieved EDID on success, or NULL otherwise. */ const struct drm_edid *drm_bridge_edid_read(struct drm_bridge *bridge, struct drm_connector *connector) { if (!(bridge->ops & DRM_BRIDGE_OP_EDID)) return NULL; return bridge->funcs->edid_read(bridge, connector); } EXPORT_SYMBOL_GPL(drm_bridge_edid_read); /** * drm_bridge_hpd_enable - enable hot plug detection for the bridge * @bridge: bridge control structure * @cb: hot-plug detection callback * @data: data to be passed to the hot-plug detection callback * * Call &drm_bridge_funcs.hpd_enable if implemented and register the given @cb * and @data as hot plug notification callback. From now on the @cb will be * called with @data when an output status change is detected by the bridge, * until hot plug notification gets disabled with drm_bridge_hpd_disable(). * * Hot plug detection is supported only if the DRM_BRIDGE_OP_HPD flag is set in * bridge->ops. This function shall not be called when the flag is not set. * * Only one hot plug detection callback can be registered at a time, it is an * error to call this function when hot plug detection is already enabled for * the bridge. */ void drm_bridge_hpd_enable(struct drm_bridge *bridge, void (*cb)(void *data, enum drm_connector_status status), void *data) { if (!(bridge->ops & DRM_BRIDGE_OP_HPD)) return; mutex_lock(&bridge->hpd_mutex); if (WARN(bridge->hpd_cb, "Hot plug detection already enabled\n")) goto unlock; bridge->hpd_cb = cb; bridge->hpd_data = data; if (bridge->funcs->hpd_enable) bridge->funcs->hpd_enable(bridge); unlock: mutex_unlock(&bridge->hpd_mutex); } EXPORT_SYMBOL_GPL(drm_bridge_hpd_enable); /** * drm_bridge_hpd_disable - disable hot plug detection for the bridge * @bridge: bridge control structure * * Call &drm_bridge_funcs.hpd_disable if implemented and unregister the hot * plug detection callback previously registered with drm_bridge_hpd_enable(). * Once this function returns the callback will not be called by the bridge * when an output status change occurs. * * Hot plug detection is supported only if the DRM_BRIDGE_OP_HPD flag is set in * bridge->ops. This function shall not be called when the flag is not set. */ void drm_bridge_hpd_disable(struct drm_bridge *bridge) { if (!(bridge->ops & DRM_BRIDGE_OP_HPD)) return; mutex_lock(&bridge->hpd_mutex); if (bridge->funcs->hpd_disable) bridge->funcs->hpd_disable(bridge); bridge->hpd_cb = NULL; bridge->hpd_data = NULL; mutex_unlock(&bridge->hpd_mutex); } EXPORT_SYMBOL_GPL(drm_bridge_hpd_disable); /** * drm_bridge_hpd_notify - notify hot plug detection events * @bridge: bridge control structure * @status: output connection status * * Bridge drivers shall call this function to report hot plug events when they * detect a change in the output status, when hot plug detection has been * enabled by drm_bridge_hpd_enable(). * * This function shall be called in a context that can sleep. */ void drm_bridge_hpd_notify(struct drm_bridge *bridge, enum drm_connector_status status) { mutex_lock(&bridge->hpd_mutex); if (bridge->hpd_cb) bridge->hpd_cb(bridge->hpd_data, status); mutex_unlock(&bridge->hpd_mutex); } EXPORT_SYMBOL_GPL(drm_bridge_hpd_notify); #ifdef CONFIG_OF /** * of_drm_find_bridge - find the bridge corresponding to the device node in * the global bridge list * * @np: device node * * RETURNS: * drm_bridge control struct on success, NULL on failure */ struct drm_bridge *of_drm_find_bridge(struct device_node *np) { struct drm_bridge *bridge; mutex_lock(&bridge_lock); list_for_each_entry(bridge, &bridge_list, list) { if (bridge->of_node == np) { mutex_unlock(&bridge_lock); return bridge; } } mutex_unlock(&bridge_lock); return NULL; } EXPORT_SYMBOL(of_drm_find_bridge); #endif MODULE_AUTHOR("Ajay Kumar <ajaykumar.rs@samsung.com>"); MODULE_DESCRIPTION("DRM bridge infrastructure"); MODULE_LICENSE("GPL and additional rights"); |
43 42 35 35 5 25 25 4 20 53 74 47 24 35 35 35 1 35 35 4 2 1 1 60 49 16 13 4 1 9 7 3 1 6 16 22 4 3 1 9 9 3 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * synth device handlers * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include "seq_oss_synth.h" #include "seq_oss_midi.h" #include "../seq_lock.h" #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> /* * constants */ #define SNDRV_SEQ_OSS_MAX_SYNTH_NAME 30 #define MAX_SYSEX_BUFLEN 128 /* * definition of synth info records */ /* sysex buffer */ struct seq_oss_synth_sysex { int len; int skip; unsigned char buf[MAX_SYSEX_BUFLEN]; }; /* synth info */ struct seq_oss_synth { int seq_device; /* for synth_info */ int synth_type; int synth_subtype; int nr_voices; char name[SNDRV_SEQ_OSS_MAX_SYNTH_NAME]; struct snd_seq_oss_callback oper; int opened; void *private_data; snd_use_lock_t use_lock; }; /* * device table */ static int max_synth_devs; static struct seq_oss_synth *synth_devs[SNDRV_SEQ_OSS_MAX_SYNTH_DEVS]; static struct seq_oss_synth midi_synth_dev = { .seq_device = -1, .synth_type = SYNTH_TYPE_MIDI, .synth_subtype = 0, .nr_voices = 16, .name = "MIDI", }; static DEFINE_SPINLOCK(register_lock); /* * prototypes */ static struct seq_oss_synth *get_synthdev(struct seq_oss_devinfo *dp, int dev); static void reset_channels(struct seq_oss_synthinfo *info); /* * global initialization */ void __init snd_seq_oss_synth_init(void) { snd_use_lock_init(&midi_synth_dev.use_lock); } /* * registration of the synth device */ int snd_seq_oss_synth_probe(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); int i; struct seq_oss_synth *rec; struct snd_seq_oss_reg *reg = SNDRV_SEQ_DEVICE_ARGPTR(dev); unsigned long flags; rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) return -ENOMEM; rec->seq_device = -1; rec->synth_type = reg->type; rec->synth_subtype = reg->subtype; rec->nr_voices = reg->nvoices; rec->oper = reg->oper; rec->private_data = reg->private_data; rec->opened = 0; snd_use_lock_init(&rec->use_lock); /* copy and truncate the name of synth device */ strscpy(rec->name, dev->name, sizeof(rec->name)); /* registration */ spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_synth_devs; i++) { if (synth_devs[i] == NULL) break; } if (i >= max_synth_devs) { if (max_synth_devs >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS) { spin_unlock_irqrestore(®ister_lock, flags); pr_err("ALSA: seq_oss: no more synth slot\n"); kfree(rec); return -ENOMEM; } max_synth_devs++; } rec->seq_device = i; synth_devs[i] = rec; spin_unlock_irqrestore(®ister_lock, flags); dev->driver_data = rec; #ifdef SNDRV_OSS_INFO_DEV_SYNTH if (i < SNDRV_CARDS) snd_oss_info_register(SNDRV_OSS_INFO_DEV_SYNTH, i, rec->name); #endif return 0; } int snd_seq_oss_synth_remove(struct device *_dev) { struct snd_seq_device *dev = to_seq_dev(_dev); int index; struct seq_oss_synth *rec = dev->driver_data; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); for (index = 0; index < max_synth_devs; index++) { if (synth_devs[index] == rec) break; } if (index >= max_synth_devs) { spin_unlock_irqrestore(®ister_lock, flags); pr_err("ALSA: seq_oss: can't unregister synth\n"); return -EINVAL; } synth_devs[index] = NULL; if (index == max_synth_devs - 1) { for (index--; index >= 0; index--) { if (synth_devs[index]) break; } max_synth_devs = index + 1; } spin_unlock_irqrestore(®ister_lock, flags); #ifdef SNDRV_OSS_INFO_DEV_SYNTH if (rec->seq_device < SNDRV_CARDS) snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_SYNTH, rec->seq_device); #endif snd_use_lock_sync(&rec->use_lock); kfree(rec); return 0; } /* */ static struct seq_oss_synth * get_sdev(int dev) { struct seq_oss_synth *rec; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); rec = synth_devs[dev]; if (rec) snd_use_lock_use(&rec->use_lock); spin_unlock_irqrestore(®ister_lock, flags); return rec; } /* * set up synth tables */ void snd_seq_oss_synth_setup(struct seq_oss_devinfo *dp) { int i; struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; dp->max_synthdev = max_synth_devs; dp->synth_opened = 0; memset(dp->synths, 0, sizeof(dp->synths)); for (i = 0; i < dp->max_synthdev; i++) { rec = get_sdev(i); if (rec == NULL) continue; if (rec->oper.open == NULL || rec->oper.close == NULL) { snd_use_lock_free(&rec->use_lock); continue; } info = &dp->synths[i]; info->arg.app_index = dp->port; info->arg.file_mode = dp->file_mode; info->arg.seq_mode = dp->seq_mode; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) info->arg.event_passing = SNDRV_SEQ_OSS_PROCESS_EVENTS; else info->arg.event_passing = SNDRV_SEQ_OSS_PASS_EVENTS; info->opened = 0; if (!try_module_get(rec->oper.owner)) { snd_use_lock_free(&rec->use_lock); continue; } if (rec->oper.open(&info->arg, rec->private_data) < 0) { module_put(rec->oper.owner); snd_use_lock_free(&rec->use_lock); continue; } info->nr_voices = rec->nr_voices; if (info->nr_voices > 0) { info->ch = kcalloc(info->nr_voices, sizeof(struct seq_oss_chinfo), GFP_KERNEL); if (!info->ch) { rec->oper.close(&info->arg); module_put(rec->oper.owner); snd_use_lock_free(&rec->use_lock); continue; } reset_channels(info); } info->opened++; rec->opened++; dp->synth_opened++; snd_use_lock_free(&rec->use_lock); } } /* * set up synth tables for MIDI emulation - /dev/music mode only */ void snd_seq_oss_synth_setup_midi(struct seq_oss_devinfo *dp) { int i; if (dp->max_synthdev >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS) return; for (i = 0; i < dp->max_mididev; i++) { struct seq_oss_synthinfo *info; info = &dp->synths[dp->max_synthdev]; if (snd_seq_oss_midi_open(dp, i, dp->file_mode) < 0) continue; info->arg.app_index = dp->port; info->arg.file_mode = dp->file_mode; info->arg.seq_mode = dp->seq_mode; info->arg.private_data = info; info->is_midi = 1; info->midi_mapped = i; info->arg.event_passing = SNDRV_SEQ_OSS_PASS_EVENTS; snd_seq_oss_midi_get_addr(dp, i, &info->arg.addr); info->opened = 1; midi_synth_dev.opened++; dp->max_synthdev++; if (dp->max_synthdev >= SNDRV_SEQ_OSS_MAX_SYNTH_DEVS) break; } } /* * clean up synth tables */ void snd_seq_oss_synth_cleanup(struct seq_oss_devinfo *dp) { int i; struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; if (snd_BUG_ON(dp->max_synthdev > SNDRV_SEQ_OSS_MAX_SYNTH_DEVS)) return; for (i = 0; i < dp->max_synthdev; i++) { info = &dp->synths[i]; if (! info->opened) continue; if (info->is_midi) { if (midi_synth_dev.opened > 0) { snd_seq_oss_midi_close(dp, info->midi_mapped); midi_synth_dev.opened--; } } else { rec = get_sdev(i); if (rec == NULL) continue; if (rec->opened > 0) { rec->oper.close(&info->arg); module_put(rec->oper.owner); rec->opened = 0; } snd_use_lock_free(&rec->use_lock); } kfree(info->sysex); info->sysex = NULL; kfree(info->ch); info->ch = NULL; } dp->synth_opened = 0; dp->max_synthdev = 0; } static struct seq_oss_synthinfo * get_synthinfo_nospec(struct seq_oss_devinfo *dp, int dev) { if (dev < 0 || dev >= dp->max_synthdev) return NULL; dev = array_index_nospec(dev, SNDRV_SEQ_OSS_MAX_SYNTH_DEVS); return &dp->synths[dev]; } /* * return synth device information pointer */ static struct seq_oss_synth * get_synthdev(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info = get_synthinfo_nospec(dp, dev); if (!info) return NULL; if (!info->opened) return NULL; if (info->is_midi) { rec = &midi_synth_dev; snd_use_lock_use(&rec->use_lock); } else { rec = get_sdev(dev); if (!rec) return NULL; } if (! rec->opened) { snd_use_lock_free(&rec->use_lock); return NULL; } return rec; } /* * reset note and velocity on each channel. */ static void reset_channels(struct seq_oss_synthinfo *info) { int i; if (info->ch == NULL || ! info->nr_voices) return; for (i = 0; i < info->nr_voices; i++) { info->ch[i].note = -1; info->ch[i].vel = 0; } } /* * reset synth device: * call reset callback. if no callback is defined, send a heartbeat * event to the corresponding port. */ void snd_seq_oss_synth_reset(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; info = get_synthinfo_nospec(dp, dev); if (!info || !info->opened) return; if (info->sysex) info->sysex->len = 0; /* reset sysex */ reset_channels(info); if (info->is_midi) { if (midi_synth_dev.opened <= 0) return; snd_seq_oss_midi_reset(dp, info->midi_mapped); /* reopen the device */ snd_seq_oss_midi_close(dp, dev); if (snd_seq_oss_midi_open(dp, info->midi_mapped, dp->file_mode) < 0) { midi_synth_dev.opened--; info->opened = 0; kfree(info->sysex); info->sysex = NULL; kfree(info->ch); info->ch = NULL; } return; } rec = get_sdev(dev); if (rec == NULL) return; if (rec->oper.reset) { rec->oper.reset(&info->arg); } else { struct snd_seq_event ev; memset(&ev, 0, sizeof(ev)); snd_seq_oss_fill_addr(dp, &ev, info->arg.addr.client, info->arg.addr.port); ev.type = SNDRV_SEQ_EVENT_RESET; snd_seq_oss_dispatch(dp, &ev, 0, 0); } snd_use_lock_free(&rec->use_lock); } /* * load a patch record: * call load_patch callback function */ int snd_seq_oss_synth_load_patch(struct seq_oss_devinfo *dp, int dev, int fmt, const char __user *buf, int p, int c) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; int rc; info = get_synthinfo_nospec(dp, dev); if (!info) return -ENXIO; if (info->is_midi) return 0; rec = get_synthdev(dp, dev); if (!rec) return -ENXIO; if (rec->oper.load_patch == NULL) rc = -ENXIO; else rc = rec->oper.load_patch(&info->arg, fmt, buf, p, c); snd_use_lock_free(&rec->use_lock); return rc; } /* * check if the device is valid synth device and return the synth info */ struct seq_oss_synthinfo * snd_seq_oss_synth_info(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_synth *rec; rec = get_synthdev(dp, dev); if (rec) { snd_use_lock_free(&rec->use_lock); return get_synthinfo_nospec(dp, dev); } return NULL; } /* * receive OSS 6 byte sysex packet: * the full sysex message will be sent if it reaches to the end of data * (0xff). */ int snd_seq_oss_synth_sysex(struct seq_oss_devinfo *dp, int dev, unsigned char *buf, struct snd_seq_event *ev) { int i, send; unsigned char *dest; struct seq_oss_synth_sysex *sysex; struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info) return -ENXIO; sysex = info->sysex; if (sysex == NULL) { sysex = kzalloc(sizeof(*sysex), GFP_KERNEL); if (sysex == NULL) return -ENOMEM; info->sysex = sysex; } send = 0; dest = sysex->buf + sysex->len; /* copy 6 byte packet to the buffer */ for (i = 0; i < 6; i++) { if (buf[i] == 0xff) { send = 1; break; } dest[i] = buf[i]; sysex->len++; if (sysex->len >= MAX_SYSEX_BUFLEN) { sysex->len = 0; sysex->skip = 1; break; } } if (sysex->len && send) { if (sysex->skip) { sysex->skip = 0; sysex->len = 0; return -EINVAL; /* skip */ } /* copy the data to event record and send it */ ev->flags = SNDRV_SEQ_EVENT_LENGTH_VARIABLE; if (snd_seq_oss_synth_addr(dp, dev, ev)) return -EINVAL; ev->data.ext.len = sysex->len; ev->data.ext.ptr = sysex->buf; sysex->len = 0; return 0; } return -EINVAL; /* skip */ } /* * fill the event source/destination addresses */ int snd_seq_oss_synth_addr(struct seq_oss_devinfo *dp, int dev, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info = snd_seq_oss_synth_info(dp, dev); if (!info) return -EINVAL; snd_seq_oss_fill_addr(dp, ev, info->arg.addr.client, info->arg.addr.port); return 0; } /* * OSS compatible ioctl */ int snd_seq_oss_synth_ioctl(struct seq_oss_devinfo *dp, int dev, unsigned int cmd, unsigned long addr) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info; int rc; info = get_synthinfo_nospec(dp, dev); if (!info || info->is_midi) return -ENXIO; rec = get_synthdev(dp, dev); if (!rec) return -ENXIO; if (rec->oper.ioctl == NULL) rc = -ENXIO; else rc = rec->oper.ioctl(&info->arg, cmd, addr); snd_use_lock_free(&rec->use_lock); return rc; } /* * send OSS raw events - SEQ_PRIVATE and SEQ_VOLUME */ int snd_seq_oss_synth_raw_event(struct seq_oss_devinfo *dp, int dev, unsigned char *data, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info || info->is_midi) return -ENXIO; ev->type = SNDRV_SEQ_EVENT_OSS; memcpy(ev->data.raw8.d, data, 8); return snd_seq_oss_synth_addr(dp, dev, ev); } /* * create OSS compatible synth_info record */ int snd_seq_oss_synth_make_info(struct seq_oss_devinfo *dp, int dev, struct synth_info *inf) { struct seq_oss_synth *rec; struct seq_oss_synthinfo *info = get_synthinfo_nospec(dp, dev); if (!info) return -ENXIO; if (info->is_midi) { struct midi_info minf; if (snd_seq_oss_midi_make_info(dp, info->midi_mapped, &minf)) return -ENXIO; inf->synth_type = SYNTH_TYPE_MIDI; inf->synth_subtype = 0; inf->nr_voices = 16; inf->device = dev; strscpy(inf->name, minf.name, sizeof(inf->name)); } else { rec = get_synthdev(dp, dev); if (!rec) return -ENXIO; inf->synth_type = rec->synth_type; inf->synth_subtype = rec->synth_subtype; inf->nr_voices = rec->nr_voices; inf->device = dev; strscpy(inf->name, rec->name, sizeof(inf->name)); snd_use_lock_free(&rec->use_lock); } return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ void snd_seq_oss_synth_info_read(struct snd_info_buffer *buf) { int i; struct seq_oss_synth *rec; snd_iprintf(buf, "\nNumber of synth devices: %d\n", max_synth_devs); for (i = 0; i < max_synth_devs; i++) { snd_iprintf(buf, "\nsynth %d: ", i); rec = get_sdev(i); if (rec == NULL) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "[%s]\n", rec->name); snd_iprintf(buf, " type 0x%x : subtype 0x%x : voices %d\n", rec->synth_type, rec->synth_subtype, rec->nr_voices); snd_iprintf(buf, " capabilities : ioctl %s / load_patch %s\n", enabled_str((long)rec->oper.ioctl), enabled_str((long)rec->oper.load_patch)); snd_use_lock_free(&rec->use_lock); } } #endif /* CONFIG_SND_PROC_FS */ |
33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef _CRYPTO_BLAKE2B_H #define _CRYPTO_BLAKE2B_H #include <linux/bug.h> #include <linux/types.h> #include <linux/string.h> enum blake2b_lengths { BLAKE2B_BLOCK_SIZE = 128, BLAKE2B_HASH_SIZE = 64, BLAKE2B_KEY_SIZE = 64, BLAKE2B_160_HASH_SIZE = 20, BLAKE2B_256_HASH_SIZE = 32, BLAKE2B_384_HASH_SIZE = 48, BLAKE2B_512_HASH_SIZE = 64, }; struct blake2b_state { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ u64 h[8]; u64 t[2]; u64 f[2]; u8 buf[BLAKE2B_BLOCK_SIZE]; unsigned int buflen; unsigned int outlen; }; enum blake2b_iv { BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL, BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL, BLAKE2B_IV2 = 0x3C6EF372FE94F82BULL, BLAKE2B_IV3 = 0xA54FF53A5F1D36F1ULL, BLAKE2B_IV4 = 0x510E527FADE682D1ULL, BLAKE2B_IV5 = 0x9B05688C2B3E6C1FULL, BLAKE2B_IV6 = 0x1F83D9ABFB41BD6BULL, BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL, }; static inline void __blake2b_init(struct blake2b_state *state, size_t outlen, const void *key, size_t keylen) { state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); state->h[1] = BLAKE2B_IV1; state->h[2] = BLAKE2B_IV2; state->h[3] = BLAKE2B_IV3; state->h[4] = BLAKE2B_IV4; state->h[5] = BLAKE2B_IV5; state->h[6] = BLAKE2B_IV6; state->h[7] = BLAKE2B_IV7; state->t[0] = 0; state->t[1] = 0; state->f[0] = 0; state->f[1] = 0; state->buflen = 0; state->outlen = outlen; if (keylen) { memcpy(state->buf, key, keylen); memset(&state->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen); state->buflen = BLAKE2B_BLOCK_SIZE; } } #endif /* _CRYPTO_BLAKE2B_H */ |
472 15 2 2 7 1132 6 2 2 2 1132 1129 2 1129 6 6 6 1135 1130 1137 1133 33 1133 1136 1130 1131 1135 1133 1136 1134 14 471 1135 473 15 473 472 474 472 474 15 473 472 472 474 1132 472 1136 5 1134 474 474 1135 7 1126 1127 1133 31 1133 1134 1133 1137 1130 1135 1132 7 1127 1135 1128 6 6 6 6 6 5 5 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002 Andi Kleen, SuSE Labs. * Thanks to Ben LaHaise for precious feedback. */ #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> #include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/memtype.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include "../mm_internal.h" /* * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { unsigned long *vaddr; pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; unsigned long curpage; unsigned long pfn; unsigned int flags; unsigned int force_split : 1, force_static_prot : 1, force_flush_all : 1; struct page **pages; }; enum cpa_warn { CPA_CONFLICT, CPA_PROTECT, CPA_DETECT, }; static const int cpa_warn_level = CPA_PROTECT; /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb * entries change the page attribute in parallel to some other cpu * splitting a large page entry along with changing the attribute. */ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { /* Protect against CPA */ spin_lock(&pgd_lock); direct_pages_count[level] += pages; spin_unlock(&pgd_lock); } static void split_page_count(int level) { if (direct_pages_count[level] == 0) return; direct_pages_count[level]--; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); } direct_pages_count[level - 1] += PTRS_PER_PTE; } void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", direct_pages_count[PG_LEVEL_4K] << 2); #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) seq_printf(m, "DirectMap2M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 11); #else seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); } #else static inline void split_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS static unsigned long cpa_1g_checked; static unsigned long cpa_1g_sameprot; static unsigned long cpa_1g_preserved; static unsigned long cpa_2m_checked; static unsigned long cpa_2m_sameprot; static unsigned long cpa_2m_preserved; static unsigned long cpa_4k_install; static inline void cpa_inc_1g_checked(void) { cpa_1g_checked++; } static inline void cpa_inc_2m_checked(void) { cpa_2m_checked++; } static inline void cpa_inc_4k_install(void) { data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) { if (level == PG_LEVEL_1G) cpa_1g_sameprot++; else cpa_2m_sameprot++; } static inline void cpa_inc_lp_preserved(int level) { if (level == PG_LEVEL_1G) cpa_1g_preserved++; else cpa_2m_preserved++; } static int cpastats_show(struct seq_file *m, void *p) { seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); return 0; } static int cpastats_open(struct inode *inode, struct file *file) { return single_open(file, cpastats_show, NULL); } static const struct file_operations cpastats_fops = { .open = cpastats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init cpa_stats_init(void) { debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, &cpastats_fops); return 0; } late_initcall(cpa_stats_init); #else static inline void cpa_inc_1g_checked(void) { } static inline void cpa_inc_2m_checked(void) { } static inline void cpa_inc_4k_install(void) { } static inline void cpa_inc_lp_sameprot(int level) { } static inline void cpa_inc_lp_preserved(int level) { } #endif static inline int within(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr < end; } static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } #ifdef CONFIG_X86_64 /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): * * 1. The kernel direct map (0xffff880000000000) * 2. The "high kernel map" (0xffffffff81000000) * * We actually execute out of #2. If we get the address of a kernel symbol, it * points to #2, but almost all physical-to-virtual translations point to #1. * * This is so that we can have both a directmap of all physical memory *and* * take full advantage of the limited (s32) immediate addressing range (2G) * of x86_64. * * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { /* Do not reference physical address outside the kernel. */ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* * Kernel text has an alias mapping at a high address, known * here as "highmap". */ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } #else static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* There is no highmap on 32-bit */ return false; } #endif /* * See set_mce_nospec(). * * Machine check recovery code needs to change cache mode of poisoned pages to * UC to avoid speculative access logging another error. But passing the * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a * speculative access. So we cheat and flip the top bit of the address. This * works fine for the code that updates the page tables. But at the end of the * process we need to flush the TLB and cache and the non-canonical address * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. * * But in the common case we already have a canonical address. This code * will fix the top bit if needed and is a no-op otherwise. */ static inline unsigned long fix_addr(unsigned long addr) { #ifdef CONFIG_X86_64 return (long)(addr << 1) >> 1; #else return addr; #endif } static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; if (unlikely(PageHighMem(page))) return 0; return (unsigned long)page_address(page); } if (cpa->flags & CPA_ARRAY) return cpa->vaddr[idx]; return *cpa->vaddr + idx * PAGE_SIZE; } /* * Flushing functions */ static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; if (p >= vend) return; for (; p < vend; p += clflush_size) clflushopt(p); } /** * clflush_cache_range - flush a cache range with clflush * @vaddr: virtual start address * @size: number of bytes to flush * * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or * SFENCE to avoid ordering issues. */ void clflush_cache_range(void *vaddr, unsigned int size) { mb(); clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_invalidate_pmem(void *addr, size_t size) { clflush_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION bool cpu_cache_has_invalidate_memregion(void) { return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); int cpu_cache_invalidate_memregion(int res_desc) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; wbinvd_on_all_cpus(); return 0; } EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM"); #endif static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; /* * Flush all to work around Errata in early athlons regarding * large page flushing. */ __flush_tlb_all(); if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; for (i = 0; i < cpa->numpages; i++) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) { struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); else on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) return; mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, unsigned long r2_start, unsigned long r2_end) { return (r1_start <= r2_end && r1_end >= r2_start) || (r2_start <= r1_end && r2_end >= r1_start); } #ifdef CONFIG_PCI_BIOS /* * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS * based config access (CONFIG_PCI_GOBIOS) support. */ #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) return _PAGE_NX; return 0; } #else static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { return 0; } #endif /* * The .rodata section needs to be read-only. Using the pfn catches all * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true. */ static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) { unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* * Note: __end_rodata is at page aligned and not inclusive, so * subtract 1 to get the last enforced PFN in the rodata area. */ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) return _PAGE_RW; return 0; } /* * Protect kernel text against becoming non executable by forbidding * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) * out of which the kernel actually executes. Do not protect the low * mapping. * * This does not cover __inittext since that is gone after boot. */ static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)_etext - 1; unsigned long t_start = (unsigned long)_text; if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0; } #if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections * will be always read-only. For the kernel identity mappings covering the * holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data at no * extra cost. */ static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; unsigned long t_start = (unsigned long)_text; unsigned int level; if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if * the current mapping is already using small page mapping. No * need to work hard to preserve large page mappings in this case. * * This also fixes the Linux Xen paravirt guest boot failure caused * by unexpected read-only mappings for kernel identity * mappings. In this paravirt guest case, the kernel text mapping * and the kernel identity mapping share the same page-table pages, * so the protections for kernel text and identity mappings have to * be the same. */ if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0; } #else static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { return 0; } #endif static inline bool conflicts(pgprot_t prot, pgprotval_t val) { return (pgprot_val(prot) & ~val) != pgprot_val(prot); } static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, unsigned long start, unsigned long end, unsigned long pfn, const char *txt) { static const char *lvltxt[] = { [CPA_CONFLICT] = "conflict", [CPA_PROTECT] = "protect", [CPA_DETECT] = "detect", }; if (warnlvl > cpa_warn_level || !conflicts(prot, val)) return; pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), (unsigned long long)val); } /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; /* * There is no point in checking RW/NX conflicts when the requested * mapping is setting the page !PRESENT. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) return prot; /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; res = protect_kernel_text(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; /* * Special case to preserve a large page. If the change spawns the * full large page mapping then there is no point to split it * up. Happens with ftrace and is going to be removed once ftrace * switched to text_poke(). */ if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { res = protect_kernel_text_ro(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); forbidden |= res; } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); forbidden |= res; res = protect_rodata(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); forbidden |= res; return __pgprot(pgprot_val(prot) & ~forbidden); } /* * Validate strict W^X semantics. */ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, unsigned long pfn, unsigned long npg, bool nx, bool rw) { unsigned long end; /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable * detection and enforcement there. */ if (IS_ENABLED(CONFIG_X86_32)) return new; /* Only verify when NX is supported: */ if (!(__supported_pte_mask & _PAGE_NX)) return new; if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) return new; if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) return new; /* Non-leaf translation entries can disable writing or execution. */ if (!rw || nx) return new; end = start + npg * PAGE_SIZE - 1; WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", (unsigned long long)pgprot_val(old), (unsigned long long)pgprot_val(new), start, end, pfn); /* * For now, allow all permission change attempts by returning the * attempted permissions. This can 'return old' to actively * refuse the permission change at a later time. */ return new; } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry (or NULL if the entry does not exist), * the level of the entry, and the effective NX and RW bits of all * page table levels. */ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw) { p4d_t *p4d; pud_t *pud; pmd_t *pmd; *level = PG_LEVEL_256T; *nx = false; *rw = true; if (pgd_none(*pgd)) return NULL; *level = PG_LEVEL_512G; *nx |= pgd_flags(*pgd) & _PAGE_NX; *rw &= pgd_flags(*pgd) & _PAGE_RW; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; *level = PG_LEVEL_1G; *nx |= p4d_flags(*p4d) & _PAGE_NX; *rw &= p4d_flags(*p4d) & _PAGE_RW; pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; *level = PG_LEVEL_2M; *nx |= pud_flags(*pud) & _PAGE_NX; *rw &= pud_flags(*pud) & _PAGE_RW; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; *nx |= pmd_flags(*pmd) & _PAGE_NX; *rw &= pmd_flags(*pmd) & _PAGE_RW; return pte_offset_kernel(pmd, address); } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry and the level of the mapping. */ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { bool nx, rw; return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw); } /* * Lookup the page table entry for a virtual address. Return a pointer * to the entry and the level of the mapping. * * Note: the function returns p4d, pud or pmd either when the entry is marked * large or when the present bit is not set. Otherwise it returns NULL. */ pte_t *lookup_address(unsigned long address, unsigned int *level) { return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level, bool *nx, bool *rw) { pgd_t *pgd; if (!cpa->pgd) pgd = pgd_offset_k(address); else pgd = cpa->pgd + pgd_index(address); return lookup_address_in_pgd_attr(pgd, address, level, nx, rw); } /* * Lookup the PMD entry for a virtual address. Return a pointer to the entry * or NULL if not present. */ pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); } /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * * Note that as long as the PTEs are well-formed with correct PFNs, this * works without checking the PRESENT bit in the leaf PTE. This is unlike * the similar vmalloc_to_page() and derivatives. Callers may depend on * this behavior. * * This could be optimized, but it is only used in paths that are not perf * sensitive, and keeping it unoptimized should increase the testing coverage * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; phys_addr_t phys_addr; unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); /* * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t * before being left-shifted PAGE_SHIFT bits -- this trick is to * make 32-PAE kernel work correctly. */ switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); /* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 if (!SHARED_KERNEL_PMD) { struct page *page; list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } } #endif } static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) { /* * _PAGE_GLOBAL means "global page" for present PTEs. * But, it is also used to indicate _PAGE_PROTNONE * for non-present PTEs. * * This ensures that a _PAGE_GLOBAL PTE going from * present to non-present is not confused as * _PAGE_PROTNONE. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) pgprot_val(prot) &= ~_PAGE_GLOBAL; return prot; } static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, *tmp; enum pg_level level; bool nx, rw; /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); cpa_inc_1g_checked(); break; default: return -EINVAL; } psize = page_level_size(level); pmask = page_level_mask(level); /* * Calculate the number of pages, which fit into this large * page starting at address: */ lpaddr = (address + psize) & pmask; numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. */ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * req_prot is in format of 4k pages. It must be converted to large * page format: the caching mode includes the PAT bit located at * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need to add the * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; /* * Calculate the large page base address and the number of 4K pages * in the large page */ lpaddr = address & pmask; numpages = psize >> PAGE_SHIFT; /* * Sanity check that the existing mapping is correct versus the static * protections. static_protections() guards against !PRESENT, so no * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* * Split the large page and tell the split code to * enforce static protections. */ cpa->force_static_prot = 1; return 1; } /* * Optimization: If the requested pgprot is the same as the current * pgprot, then the large page can be preserved and no updates are * required independent of alignment and length of the requested * range. The above already established that the current pgprot is * correct, which in consequence makes the requested pgprot correct * as well if it is the same. The static protection scan below will * not come to a different conclusion. */ if (pgprot_val(req_prot) == pgprot_val(old_prot)) { cpa_inc_lp_sameprot(level); return 0; } /* * If the requested range does not cover the full page, split it up */ if (address != lpaddr || cpa->numpages != numpages) return 1; /* * Check whether the requested pgprot is conflicting with a static * protection requirement in the large page. */ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, psize, CPA_DETECT); new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages, nx, rw); /* * If there is a conflict, split the large page. * * There used to be a 4k wise evaluation trying really hard to * preserve the large pages, but experimentation has shown, that this * does not help at all. There might be corner cases which would * preserve one large page occasionally, but it's really not worth the * extra code and cycles for the common case. */ if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1; /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; cpa_inc_lp_preserved(level); return 0; } static int should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { int do_split; if (cpa->force_split) return 1; spin_lock(&pgd_lock); do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, pgprot_t ref_prot, unsigned long address, unsigned long size) { unsigned int npg = PFN_DOWN(size); pgprot_t prot; /* * If should_split_large_page() discovered an inconsistent mapping, * remove the invalid protection in the split mapping. */ if (!cpa->force_static_prot) goto set; /* Hand in lpsize = 0 to enforce the protection mechanism */ prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; /* * If this is splitting a PMD, fix it up. PUD splits cannot be * fixed trivially as that would require to rescan the newly * installed PMD mappings after returning from split_large_page() * so an eventual further split can allocate the necessary PTE * pages. Warn for now and revisit it in case this actually * happens. */ if (size == PAGE_SIZE) ref_prot = prot; else pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); set: set_pte(pte, pfn_pte(pfn, ref_prot)); } static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); unsigned int i, level; pgprot_t ref_prot; bool nx, rw; pte_t *tmp; spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); /* * Clear PSE (aka _PAGE_PAT) and move * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); ref_pfn = pmd_pfn(*(pmd_t *)kpte); lpaddr = address & PMD_MASK; lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present() will return true even on a non * present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; break; default: spin_unlock(&pgd_lock); return 1; } ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); if (pfn_range_is_mapped(pfn, pfn + 1)) split_page_count(level); } /* * Install the new, split up pagetable. * * We use the standard kernel pagetable protections for the new * pagetable protections, the actual ptes set above control the * primary protection behavior: */ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* * Do a global flush tlb after splitting the large page * and before we do the actual change page attribute in the PTE. * * Without this, we violate the TLB application note, that says: * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." * * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry. */ flush_tlb_all(); spin_unlock(&pgd_lock); return 0; } static int split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address) { struct page *base; if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; } static bool try_to_free_pte_page(pte_t *pte) { int i; for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; free_page((unsigned long)pte); return true; } static bool try_to_free_pmd_page(pmd_t *pmd) { int i; for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; free_page((unsigned long)pmd); return true; } static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); while (start < end) { set_pte(pte, __pte(0)); start += PAGE_SIZE; pte++; } if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { if (unmap_pte_range(pmd, start, end)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); /* * Not on a 2MB page boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); __unmap_pmd_range(pud, pmd, start, pre_end); start = pre_end; pmd++; } /* * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { if (pmd_leaf(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); start += PMD_SIZE; pmd++; } /* * 4K leftovers? */ if (start < end) return __unmap_pmd_range(pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? */ if (start & (PUD_SIZE - 1)) { unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); unmap_pmd_range(pud, start, pre_end); start = pre_end; pud++; } /* * Try to unmap in 1G chunks? */ while (end - start >= PUD_SIZE) { if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; } /* * 2M leftovers? */ if (start < end) unmap_pmd_range(pud, start, end); /* * No need to try to free the PUD page because we'll free it in * populate_pgd's error path */ } static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); return 0; } static int alloc_pmd_page(pud_t *pud) { pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); return 0; } static void populate_pte(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) { pte_t *pte; pte = pte_offset_kernel(pmd, start); pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; cpa->pfn++; pte++; } } static long populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pud_t *pud, pgprot_t pgprot) { long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; /* * Not on a 2M boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long pre_end = start + (num_pages << PAGE_SHIFT); unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; pre_end = min_t(unsigned long, pre_end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(unsigned int, num_pages, cur_pages); /* * Need a PTE page? */ pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); start = pre_end; } /* * We mapped them all? */ if (num_pages == cur_pages) return cur_pages; pmd_pgprot = pgprot_4k_2_large(pgprot); while (end - start >= PMD_SIZE) { /* * We cannot use a 1G page so allocate a PMD page if needed. */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; pmd = pmd_offset(pud, start); set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } /* * Map trailing 4K pages. */ if (start < end) { pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, end, num_pages - cur_pages, pmd, pgprot); } return num_pages; } static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, pgprot_t pgprot) { pud_t *pud; unsigned long end; long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); /* * Not on a Gb page boundary? => map everything up to it with * smaller pages. */ if (start & (PUD_SIZE - 1)) { unsigned long pre_end; unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; pre_end = min_t(unsigned long, end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); pud = pud_offset(p4d, start); /* * Need a PMD page? */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, pud, pgprot); if (cur_pages < 0) return cur_pages; start = pre_end; } /* We mapped them all? */ if (cpa->numpages == cur_pages) return cur_pages; pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } /* Map trailing leftover */ if (start < end) { long tmp; pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, pud, pgprot); if (tmp < 0) return cur_pages; cur_pages += tmp; } return cur_pages; } /* * Restrictions for kernel page table do not necessarily apply when mapping in * an alternate PGD. */ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); } /* * Allocate a PUD page and hand it down for mapping. */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } cpa->numpages = ret; return 0; } static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { if (cpa->pgd) { /* * Right now, we only execute this code path when mapping * the EFI virtual memory map regions, no other users * provide a ->pgd value. This may change in the future. */ return populate_pgd(cpa, vaddr); } /* * Ignore all non primary paths. */ if (!primary) { cpa->numpages = 1; return 0; } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected * to have holes. * Also set numpages to '1' indicating that we processed cpa req for * one virtual address page and its pfn. TBD: numpages can be set based * on the initial value and the level returned by lookup_address(). */ if (within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; } else if (__cpa_pfn_in_highmap(cpa->pfn)) { /* Faults in the highmap are OK, so do not warn: */ return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, *cpa->vaddr); return -EFAULT; } } static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; bool nx, rw; address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (!kpte) return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; pgprot_t old_prot = pte_pgprot(old_pte); pgprot_t new_prot = pte_pgprot(old_pte); unsigned long pfn = pte_pfn(old_pte); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); /* Hand in lpsize = 0 to enforce the protection mechanism */ new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1, nx, rw); new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change its attributes * not the memory it points to */ new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; } /* * Check, whether we can keep the large page intact * and just change the pte: */ do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) return do_split; /* * We have to split the large page: */ err = split_large_page(cpa, kpte, address); if (!err) goto repeat; return err; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); /* * Check the directmap and "high kernel map" 'aliases'. */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); unsigned long vaddr; int ret; if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; /* * No need to redo, when the primary call touched the direct * mapping already: */ vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* Directmap always has NX set, do not modify. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; } #ifdef CONFIG_X86_64 /* * If the primary call didn't touch the high mapping already * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* * [_text, _brk_end) also covers data, do not modify NX except * in cases where the highmap is the primary target. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. */ __change_page_attr_set_clr(&alias_cpa, 0); } #endif return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; /* * No changes, easy! */ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && !cpa->force_split) return ret; while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; } /* * Adjust the number of pages with the result of the * CPA operation. Either a large page has been * preserved or a single page update happened. */ BUG_ON(cpa->numpages > rempages || !cpa->numpages); rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } out: /* Restore the original numpages */ cpa->numpages = numpages; return ret; } static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages) { struct cpa_data cpa; int ret, cache; memset(&cpa, 0, sizeof(cpa)); /* * Check, if we are requested to set a not supported * feature. Clearing non-supported features is OK. */ mask_set = canon_pgprot(mask_set); if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { addr[i] &= PAGE_MASK; WARN_ON_ONCE(1); } } } else if (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. * No need to check in that case */ if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; /* * People should not be passing in unaligned addresses: */ WARN_ON_ONCE(1); } } /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: */ if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* * No need to flush, when we did not set any of the caching * attributes: */ cache = !!pgprot2cachemode(mask_set); /* * On error; flush everything to be sure. */ if (ret) { cpa_flush_all(cache); goto out; } cpa_flush(&cpa, cache); out: return ret; } static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, (array ? CPA_ARRAY : 0), NULL); } static inline int cpa_set_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, CPA_PAGES_ARRAY, pages); } static inline int cpa_clear_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, CPA_PAGES_ARRAY, pages); } /* * __set_memory_prot is an internal helper for callers that have been passed * a pgprot_t value from upper layers and a reservation has already been taken. * If you want to set the pgprot to a specific page protocol, use the * set_memory_xx() functions. */ int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot) { return change_page_attr_set_clr(&addr, numpages, prot, __pgprot(~pgprot_val(prot)), 0, 0, NULL); } int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap() * If you really need strong UC use ioremap_uc(), but note * that you cannot override IO areas with set_memory_*() as * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) { int ret; /* * for now UC MINUS. see comments in ioremap() */ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; ret = _set_memory_uc(addr, numpages); if (ret) goto out_free; return 0; out_free: memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); out_err: return ret; } EXPORT_SYMBOL(set_memory_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { ret = change_page_attr_set_clr(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } return ret; } int set_memory_wc(unsigned long addr, int numpages) { int ret; ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) return ret; ret = _set_memory_wc(addr, numpages); if (ret) memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return ret; } EXPORT_SYMBOL(set_memory_wc); int _set_memory_wt(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) { int ret; ret = _set_memory_wb(addr, numpages); if (ret) return ret; memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return 0; } EXPORT_SYMBOL(set_memory_wb); /* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; /* SGX pages are not in the 1:1 map */ if (arch_is_platform_page(pfn << PAGE_SHIFT)) return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address * that looks just like the one we want, but has bit 63 flipped. * This relies on set_memory_XX() properly sanitizing any __pa() * results with __PHYSICAL_MASK or PTE_PFN_MASK. */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_nx(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; return change_page_attr_clear(&addr, numpages, clr, 0); } int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_np_noalias(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, CPA_NO_CHECK_ALIAS, NULL); } int set_memory_p(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(0), 1, 0, NULL); } int set_memory_nonglobal(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } int set_memory_global(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } /* * __set_memory_enc_pgtable() is used for the hypervisors that get * informed about "encryption" status via page tables. */ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) { pgprot_t empty = __pgprot(0); struct cpa_data cpa; int ret; /* Should not be working on unaligned addresses */ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty); cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty); cpa.pgd = init_mm.pgd; /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); /* Flush the caches as needed before changing the encryption attribute. */ if (x86_platform.guest.enc_tlb_flush_required(enc)) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); if (ret) goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); /* * After changing the encryption attribute, we need to flush TLBs again * in case any speculative TLB caching occurred (but no need to flush * caches again). We could just use cpa_flush_all(), but in case TLB * flushing gets optimized in the cpa_flush() path use the same logic * as above. */ cpa_flush(&cpa, 0); if (ret) return ret; /* Notify hypervisor that we have successfully set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc); if (ret) goto vmm_fail; return 0; vmm_fail: WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n", (void *)addr, numpages, enc ? "private" : "shared", ret); return ret; } /* * The lock serializes conversions between private and shared memory. * * It is taken for read on conversion. A write lock guarantees that no * concurrent conversions are in progress. */ static DECLARE_RWSEM(mem_enc_lock); /* * Stop new private<->shared conversions. * * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete. * The lock is not released to prevent new conversions from being started. */ bool set_memory_enc_stop_conversion(void) { /* * In a crash scenario, sleep is not allowed. Try to take the lock. * Failure indicates that there is a race with the conversion. */ if (oops_in_progress) return down_write_trylock(&mem_enc_lock); down_write(&mem_enc_lock); return true; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { int ret = 0; if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { if (!down_read_trylock(&mem_enc_lock)) return -EBUSY; ret = __set_memory_enc_pgtable(addr, numpages, enc); up_read(&mem_enc_lock); } return ret; } int set_memory_encrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, true); } EXPORT_SYMBOL_GPL(set_memory_encrypted); int set_memory_decrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, false); } EXPORT_SYMBOL_GPL(set_memory_decrypted); int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; unsigned long end; enum page_cache_mode set_type; int i; int free_idx; int ret; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (memtype_reserve(start, end, new_type, NULL)) goto err_out; } /* If WC, set to UC- first and then WC */ set_type = (new_type == _PAGE_CACHE_MODE_WC) ? _PAGE_CACHE_MODE_UC_MINUS : new_type; ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) goto err_out; return 0; /* Success */ err_out: free_idx = i; for (i = 0; i < free_idx; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return -EINVAL; } int set_pages_array_uc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_pages_wb); int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; unsigned long end; int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return 0; } EXPORT_SYMBOL(set_pages_array_wb); int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_ro(addr, numpages); } int set_pages_rw(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_rw(addr, numpages); } static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) { return __set_pages_np(page, 1); } int set_direct_map_default_noflush(struct page *page) { return __set_pages_p(page, 1); } int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) { if (valid) return __set_pages_p(page, nr); return __set_pages_np(page, nr); } #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; if (!enable) { debug_check_no_locks_freed(page_address(page), numpages * PAGE_SIZE); } /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); else __set_pages_np(page, numpages); /* * We should perform an IPI and flush all tlbs, * but that can deadlock->flush only current cpu. * Preemption needs to be disabled around __flush_tlb_all() due to * CR3 reload in __native_flush_tlb(). */ preempt_disable(); __flush_tlb_all(); preempt_enable(); arch_flush_lazy_mmu_mode(); } #endif /* CONFIG_DEBUG_PAGEALLOC */ bool kernel_page_present(struct page *page) { unsigned int level; pte_t *pte; if (PageHighMem(page)) return false; pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { int retval = -EINVAL; struct cpa_data cpa = { .vaddr = &address, .pfn = pfn, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); if (!(__supported_pte_mask & _PAGE_NX)) goto out; if (!(page_flags & _PAGE_ENC)) cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: return retval; } /* * __flush_tlb_all() flushes mappings only on current CPU and hence this * function shouldn't be used in an SMP environment. Presently, it's used only * during boot (way before smp_init()) by EFI subsystem and hence is ok. */ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages) { int retval; /* * The typical sequence for unmapping is to find a pte through * lookup_address_in_pgd() (ideally, it should never return NULL because * the address is already mapped) and change its protections. As pfn is * the *target* of a mapping, it's not useful while unmapping. */ struct cpa_data cpa = { .vaddr = &address, .pfn = 0, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; } /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. */ #ifdef CONFIG_CPA_DEBUG #include "cpa-test.c" #endif |
4 160 242 217 506 564 903 105 45 91 44 205 105 70 204 44 148 128 15 83 69 523 85 444 434 228 107 106 81 50 431 57 69 213 136 5 34 31 331 332 347 319 22 40 17 22 49 49 323 321 10 4 661 282 661 284 1 462 301 998 6 10 57 81 590 118 167 1 169 164 883 587 653 284 285 1135 1134 12 12 12 6 11 11 42 18 18 11 28 28 11 6 484 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ext4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EXT4_H #include <linux/writeback.h> #include <linux/tracepoint.h> struct ext4_allocation_context; struct ext4_allocation_request; struct ext4_extent; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ { EXT4_MB_HINT_GOAL_ONLY, "HINT_GOAL_ONLY" }, \ { EXT4_MB_HINT_TRY_GOAL, "HINT_TRY_GOAL" }, \ { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace * ring buffer. */ TRACE_DEFINE_ENUM(BH_New); TRACE_DEFINE_ENUM(BH_Mapped); TRACE_DEFINE_ENUM(BH_Unwritten); TRACE_DEFINE_ENUM(BH_Boundary); #define show_mflags(flags) __print_flags(flags, "", \ { EXT4_MAP_NEW, "N" }, \ { EXT4_MAP_MAPPED, "M" }, \ { EXT4_MAP_UNWRITTEN, "U" }, \ { EXT4_MAP_BOUNDARY, "B" }) #define show_free_flags(flags) __print_flags(flags, "|", \ { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ { EXT4_FREE_BLOCKS_FORGET, "FORGET" }, \ { EXT4_FREE_BLOCKS_VALIDATED, "VALIDATED" }, \ { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE, "NO_QUOTA" }, \ { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\ { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER, "LAST_CLUSTER" }) TRACE_DEFINE_ENUM(ES_WRITTEN_B); TRACE_DEFINE_ENUM(ES_UNWRITTEN_B); TRACE_DEFINE_ENUM(ES_DELAYED_B); TRACE_DEFINE_ENUM(ES_HOLE_B); TRACE_DEFINE_ENUM(ES_REFERENCED_B); #define show_extent_status(status) __print_flags(status, "", \ { EXTENT_STATUS_WRITTEN, "W" }, \ { EXTENT_STATUS_UNWRITTEN, "U" }, \ { EXTENT_STATUS_DELAYED, "D" }, \ { EXTENT_STATUS_HOLE, "H" }, \ { EXTENT_STATUS_REFERENCED, "R" }) #define show_falloc_mode(mode) __print_flags(mode, "|", \ { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM); TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT); TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define show_fc_reason(reason) \ __print_symbolic(reason, \ { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN); TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW); TRACE_DEFINE_ENUM(CR_ANY_FREE); #define show_criteria(cr) \ __print_symbolic(cr, \ { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" }, \ { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" }, \ { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" }, \ { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" }, \ { CR_ANY_FREE, "CR_ANY_FREE" }) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), TP_ARGS(inode, orig_ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, orig_ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u16, mode ) ), TP_fast_assign( __entry->orig_ino = orig_ino; __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->orig_ino, (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid) ); TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->blocks = inode->i_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid, __entry->blocks) ); TRACE_EVENT(ext4_request_inode, TP_PROTO(struct inode *dir, int mode), TP_ARGS(dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, nlink ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nlink = inode->i_nlink; ), TP_printk("dev %d,%d ino %lu nlink %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nlink) ); TRACE_EVENT(ext4_drop_inode, TP_PROTO(struct inode *inode, int drop), TP_ARGS(inode, drop), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, drop ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->drop = drop; ), TP_printk("dev %d,%d ino %lu drop %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->drop) ); TRACE_EVENT(ext4_nfs_commit_metadata, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mark_inode_dirty, TP_PROTO(struct inode *inode, unsigned long IP), TP_ARGS(inode, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, ip ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ip = IP; ), TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_begin_ordered_truncate, TP_PROTO(struct inode *inode, loff_t new_size), TP_ARGS(inode, new_size), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, new_size ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->new_size = new_size; ), TP_printk("dev %d,%d ino %lu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->new_size) ); DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; ), TP_printk("dev %d,%d ino %lu pos %lld len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len) ); DEFINE_EVENT(ext4__write_begin, ext4_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DECLARE_EVENT_CLASS(ext4__write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) __field( unsigned int, copied ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_da_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); TRACE_EVENT(ext4_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_pages, TP_PROTO(struct inode *inode, pgoff_t first_page, struct writeback_control *wbc), TP_ARGS(inode, first_page, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, first_page ) __field( long, nr_to_write ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->first_page = first_page; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " "sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->first_page, __entry->nr_to_write, __entry->sync_mode) ); TRACE_EVENT(ext4_da_write_pages_extent, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), TP_ARGS(inode, map), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, lblk ) __field( __u32, len ) __field( __u32, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->flags = map->m_flags; ), TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_mflags(__entry->flags)) ); TRACE_EVENT(ext4_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->sync_mode, (unsigned long) __entry->writeback_index) ); DECLARE_EVENT_CLASS(ext4__folio_op, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = folio->index; ), TP_printk("dev %d,%d ino %lu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index) ); DEFINE_EVENT(ext4__folio_op, ext4_read_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DEFINE_EVENT(ext4__folio_op, ext4_release_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) __field( size_t, offset ) __field( size_t, length ) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->ino = folio->mapping->host->i_ino; __entry->index = folio->index; __entry->offset = offset; __entry->length = length; ), TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), TP_ARGS(sb, blk, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->blk = blk; __entry->count = count; ), TP_printk("dev %d,%d blk %llu count %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u64, pa_lstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_lstart = pa->pa_lstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), TP_ARGS(pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) ), TP_fast_assign( __entry->dev = pa->pa_inode->i_sb->s_dev; __entry->ino = pa->pa_inode->i_ino; __entry->block = block; __entry->count = count; ), TP_printk("dev %d,%d ino %lu block %llu count %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d pstart %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, TP_PROTO(struct inode *inode, unsigned int len), TP_ARGS(inode, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->len = len; ), TP_printk("dev %d,%d ino %lu len: %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->len) ); TRACE_EVENT(ext4_mb_discard_preallocations, TP_PROTO(struct super_block *sb, int needed), TP_ARGS(sb, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, needed ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->needed = needed; ), TP_printk("dev %d,%d needed %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->needed) ); TRACE_EVENT(ext4_request_blocks, TP_PROTO(struct ext4_allocation_request *ar), TP_ARGS(ar), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_allocate_blocks, TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), TP_ARGS(ar, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->block, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, int flags), TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; __entry->flags = flags; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, show_free_flags(__entry->flags)) ); TRACE_EVENT(ext4_sync_file_enter, TP_PROTO(struct file *file, int datasync), TP_ARGS(file, datasync), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) ), TP_fast_assign( struct dentry *dentry = file->f_path.dentry; __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_file_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, wait ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait = wait; ), TP_printk("dev %d,%d wait %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; ), TP_printk("dev %d,%d ino %lu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->data_blocks) ); TRACE_EVENT(ext4_mballoc_alloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, goal_logical ) __field( int, goal_start ) __field( __u32, goal_group ) __field( int, goal_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) __field( __u16, found ) __field( __u16, groups ) __field( __u16, buddy ) __field( __u16, flags ) __field( __u16, tail ) __field( __u8, cr ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->goal_logical = ac->ac_g_ex.fe_logical; __entry->goal_start = ac->ac_g_ex.fe_start; __entry->goal_group = ac->ac_g_ex.fe_group; __entry->goal_len = ac->ac_g_ex.fe_len; __entry->result_logical = ac->ac_f_ex.fe_logical; __entry->result_start = ac->ac_f_ex.fe_start; __entry->result_group = ac->ac_f_ex.fe_group; __entry->result_len = ac->ac_f_ex.fe_len; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; __entry->groups = ac->ac_groups_scanned; __entry->buddy = ac->ac_buddy; __entry->tail = ac->ac_tail; __entry->cr = ac->ac_criteria; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical, __entry->found, __entry->groups, show_criteria(__entry->cr), show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); TRACE_EVENT(ext4_mballoc_prealloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; __entry->result_len = ac->ac_b_ex.fe_len; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical) ); DECLARE_EVENT_CLASS(ext4__mballoc, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, TP_PROTO(struct inode *inode, int is_metadata, __u64 block), TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( int, is_metadata ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->is_metadata = is_metadata; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, quota_claim ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->quota_claim = quota_claim; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, TP_PROTO(struct inode *inode, int nr_resv), TP_ARGS(inode, nr_resv), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, reserve_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->reserve_blocks = nr_resv; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d" "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->reserve_blocks, __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, TP_PROTO(struct inode *inode, int freed_blocks), TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks) ); DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); TRACE_EVENT(ext4_read_block_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch), TP_ARGS(sb, group, prefetch), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( bool, prefetch ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->prefetch = prefetch; ), TP_printk("dev %d,%d group %u prefetch %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->prefetch) ); DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, offset ) __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len, show_falloc_mode(__entry->mode)) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, blocks ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->blocks = max_blocks; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->blocks, __entry->ret) ); TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), TP_ARGS(parent, dentry), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( loff_t, size ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->parent = parent->i_ino; __entry->size = d_inode(dentry)->i_size; ), TP_printk("dev %d,%d ino %lu size %lld parent %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->size, (unsigned long) __entry->parent) ); TRACE_EVENT(ext4_unlink_exit, TP_PROTO(struct dentry *dentry, int ret), TP_ARGS(dentry, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); DECLARE_EVENT_CLASS(ext4__truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %lu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* 'ux' is the unwritten extent. */ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux), TP_ARGS(inode, map, ux), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " "u_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk) ); /* * 'ux' is the unwritten extent. * 'ix' is the initialized extent to which blocks are transferred. */ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux, struct ext4_extent *ix), TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) __field( ext4_lblk_t, i_lblk ) __field( unsigned, i_len ) __field( ext4_fsblk_t, i_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); __entry->i_lblk = le32_to_cpu(ix->ee_block); __entry->i_len = ext4_ext_get_actual_len(ix); __entry->i_pblk = ext4_ext_pblock(ix); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " "u_lblk %u u_len %u u_pblk %llu " "i_lblk %u i_len %u i_pblk %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk, __entry->i_lblk, __entry->i_len, __entry->i_pblk) ); DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->flags = flags; ), TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_map_flags(__entry->flags)) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->pblk = map->m_pblk; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " "mflags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, __entry->len, show_mflags(__entry->mflags), __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk) ); TRACE_EVENT(ext4_load_inode, TP_PROTO(struct super_block *sb, unsigned long ino), TP_ARGS(sb, ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = ino; ), TP_printk("dev %d,%d ino %ld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_journal_start_sb, TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_inode, TP_PROTO(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( unsigned long, ino ) __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, ino %lu, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, TP_PROTO(struct super_block *sb, int blocks, unsigned long IP), TP_ARGS(sb, blocks, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; ), TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); DECLARE_EVENT_CLASS(ext4__trim, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len), TP_STRUCT__entry( __field( int, dev_major ) __field( int, dev_minor ) __field( __u32, group ) __field( int, start ) __field( int, len ) ), TP_fast_assign( __entry->dev_major = MAJOR(sb->s_dev); __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; __entry->start = start; __entry->len = len; ), TP_printk("dev %d,%d group %u, start %d, len %d", __entry->dev_major, __entry->dev_minor, __entry->group, __entry->start, __entry->len) ); DEFINE_EVENT(ext4__trim, ext4_trim_extent, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( unsigned int, allocated ) __field( ext4_fsblk_t, newblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->allocated = allocated; __entry->newblk = newblock; ), TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_map_flags(__entry->flags), (unsigned int) __entry->allocated, (unsigned long long) __entry->newblk) ); TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), TP_ARGS(sb, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( int, ret ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->ret = ret; ), TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_mflags(__entry->flags), __entry->ret) ); TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, (unsigned short) __entry->len) ); TRACE_EVENT(ext4_remove_blocks, TP_PROTO(struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_fsblk_t to, struct partial_cluster *pc), TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, struct partial_cluster *pc), TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), TP_ARGS(inode, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; ), TP_printk("dev %d,%d ino %lu index_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long long) __entry->pblk) ); TRACE_EVENT(ext4_ext_remove_space, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth), TP_ARGS(inode, start, end, depth), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth) ); TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth, struct partial_cluster *pc, __le16 eh_entries), TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state ) __field( unsigned short, eh_entries ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d " "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); DECLARE_EVENT_CLASS(ext4__es_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); TRACE_EVENT(ext4_es_remove_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu es [%lld/%lld)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); TRACE_EVENT(ext4_es_lookup_extent_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_lookup_extent_exit, TP_PROTO(struct inode *inode, struct extent_status *es, int found), TP_ARGS(inode, es, found), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( int, found ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->found = found; ), TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->found, __entry->lblk, __entry->len, __entry->found ? __entry->pblk : 0, show_extent_status(__entry->found ? __entry->status : 0)) ); DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_to_scan ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_to_scan = nr_to_scan; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); TRACE_EVENT(ext4_es_shrink_scan_exit, TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_insert_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_es_shrink, TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, int nr_skipped, int retried), TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( unsigned long long, scan_time ) __field( int, nr_skipped ) __field( int, retried ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->scan_time = div_u64(scan_time, 1000); __entry->nr_skipped = nr_skipped; __entry->retried = retried; ), TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu " "nr_skipped %d retried %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); TRACE_EVENT(ext4_es_insert_delayed_extent, TP_PROTO(struct inode *inode, struct extent_status *es, bool lclu_allocated, bool end_allocated), TP_ARGS(inode, es, lclu_allocated, end_allocated), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( bool, lclu_allocated ) __field( bool, end_allocated ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->lclu_allocated = lclu_allocated; __entry->end_allocated = end_allocated; ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " "allocated %d %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->lclu_allocated, __entry->end_allocated) ); /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, u64 owner), TP_ARGS(sb, keydev, agno, bno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u32, agno) __field(u64, bno) __field(u64, len) __field(u64, owner) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->bno = bno; __entry->len = len; __entry->owner = owner; ), TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->bno, __entry->len, __entry->owner) ) #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(ext4_fsmap_class, name, \ TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ u64 owner), \ TP_ARGS(sb, keydev, agno, bno, len, owner)) DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); DECLARE_EVENT_CLASS(ext4_getfsmap_class, TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), TP_ARGS(sb, fsmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u64, block) __field(u64, len) __field(u64, owner) __field(u64, flags) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(fsmap->fmr_device); __entry->block = fsmap->fmr_physical; __entry->len = fsmap->fmr_length; __entry->owner = fsmap->fmr_owner; __entry->flags = fsmap->fmr_flags; ), TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, __entry->len, __entry->owner, __entry->flags) ) #define DEFINE_GETFSMAP_EVENT(name) \ DEFINE_EVENT(ext4_getfsmap_class, name, \ TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ TP_ARGS(sb, fsmap)) DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); TRACE_EVENT(ext4_shutdown, TP_PROTO(struct super_block *sb, unsigned long flags), TP_ARGS(sb, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned, flags ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = flags; ), TP_printk("dev %d,%d flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags) ); TRACE_EVENT(ext4_error, TP_PROTO(struct super_block *sb, const char *function, unsigned int line), TP_ARGS(sb, function, line), TP_STRUCT__entry( __field( dev_t, dev ) __field( const char *, function ) __field( unsigned, line ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->function = function; __entry->line = line; ), TP_printk("dev %d,%d function %s line %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->function, __entry->line) ); TRACE_EVENT(ext4_prefetch_bitmaps, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_group_t next, unsigned int prefetch_ios), TP_ARGS(sb, group, next, prefetch_ios), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( __u32, next ) __field( __u32, ios ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->next = next; __entry->ios = prefetch_ios; ), TP_printk("dev %d,%d group %u next %u ios %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->next, __entry->ios) ); TRACE_EVENT(ext4_lazy_itable_init, TP_PROTO(struct super_block *sb, ext4_group_t group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); TRACE_EVENT(ext4_fc_replay_scan, TP_PROTO(struct super_block *sb, int error, int off), TP_ARGS(sb, error, off), TP_STRUCT__entry( __field(dev_t, dev) __field(int, error) __field(int, off) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->error = error; __entry->off = off; ), TP_printk("dev %d,%d error %d, off %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->error, __entry->off) ); TRACE_EVENT(ext4_fc_replay, TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2), TP_ARGS(sb, tag, ino, priv1, priv2), TP_STRUCT__entry( __field(dev_t, dev) __field(int, tag) __field(int, ino) __field(int, priv1) __field(int, priv2) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tag = tag; __entry->ino = ino; __entry->priv1 = priv1; __entry->priv2 = priv2; ), TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tag, __entry->ino, __entry->priv1, __entry->priv2) ); TRACE_EVENT(ext4_fc_commit_start, TP_PROTO(struct super_block *sb, tid_t commit_tid), TP_ARGS(sb, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tid = commit_tid; ), TP_printk("dev %d,%d tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid) ); TRACE_EVENT(ext4_fc_commit_stop, TP_PROTO(struct super_block *sb, int nblks, int reason, tid_t commit_tid), TP_ARGS(sb, nblks, reason, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, nblks) __field(int, reason) __field(int, num_fc) __field(int, num_fc_ineligible) __field(int, nblks_agg) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nblks = nblks; __entry->reason = reason; __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->num_fc_ineligible = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks; __entry->tid = commit_tid; ), TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nblks, __entry->reason, __entry->num_fc, __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid) ); #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ __entry->fc_ineligible_rc[reason] TRACE_EVENT(ext4_fc_stats, TP_PROTO(struct super_block *sb), TP_ARGS(sb), TP_STRUCT__entry( __field(dev_t, dev) __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX) __field(unsigned long, fc_commits) __field(unsigned long, fc_ineligible_commits) __field(unsigned long, fc_numblks) ), TP_fast_assign( int i; __entry->dev = sb->s_dev; for (i = 0; i < EXT4_FC_REASON_MAX; i++) { __entry->fc_ineligible_rc[i] = EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i]; } __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->fc_ineligible_commits = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks; ), TP_printk("dev %d,%d fc ineligible reasons:\n" "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u" "num_commits:%lu, ineligible: %lu, numblks: %lu", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME), __entry->fc_commits, __entry->fc_ineligible_commits, __entry->fc_numblks) ); DECLARE_EVENT_CLASS(ext4_fc_track_dentry, TP_PROTO(handle_t *handle, struct inode *inode, struct dentry *dentry, int ret), TP_ARGS(handle, inode, dentry, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error ) ); #define DEFINE_EVENT_CLASS_DENTRY(__type) \ DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type, \ TP_PROTO(handle_t *handle, struct inode *inode, \ struct dentry *dentry, int ret), \ TP_ARGS(handle, inode, dentry, ret) \ ) DEFINE_EVENT_CLASS_DENTRY(create); DEFINE_EVENT_CLASS_DENTRY(link); DEFINE_EVENT_CLASS_DENTRY(unlink); TRACE_EVENT(ext4_fc_track_inode, TP_PROTO(handle_t *handle, struct inode *inode, int ret), TP_ARGS(handle, inode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error) ); TRACE_EVENT(ext4_fc_track_range, TP_PROTO(handle_t *handle, struct inode *inode, long start, long end, int ret), TP_ARGS(handle, inode, start, end, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(long, start) __field(long, end) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->start = start; __entry->end = end; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, start %ld, end %ld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error, __entry->start, __entry->end) ); TRACE_EVENT(ext4_fc_cleanup, TP_PROTO(journal_t *journal, int full, tid_t tid), TP_ARGS(journal, full, tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, j_fc_off) __field(int, full) __field(tid_t, tid) ), TP_fast_assign( struct super_block *sb = journal->j_private; __entry->dev = sb->s_dev; __entry->j_fc_off = journal->j_fc_off; __entry->full = full; __entry->tid = tid; ), TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->j_fc_off, __entry->full, __entry->tid) ); TRACE_EVENT(ext4_update_sb, TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk, unsigned int flags), TP_ARGS(sb, fsblk, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ext4_fsblk_t, fsblk) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->fsblk = fsblk; __entry->flags = flags; ), TP_printk("dev %d,%d fsblk %llu flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fsblk, __entry->flags) ); #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
20 7 7 7 20 19 20 27 28 1 1 19 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 13 13 19 19 19 1 1 19 19 19 19 19 19 19 19 19 19 19 13 6 19 19 19 19 19 19 19 19 19 6 13 13 19 19 18 19 19 19 19 19 19 19 18 19 19 19 19 19 19 19 19 19 19 19 19 19 19 28 21 7 2 19 19 19 19 19 19 19 27 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 | // SPDX-License-Identifier: GPL-2.0 /* * bcachefs setup/teardown code, and some metadata io - read a superblock and * figure out what to do with it. * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" #include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" #include "clock.h" #include "compress.h" #include "debug.h" #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" #include "error.h" #include "fs.h" #include "fs-io.h" #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fsck.h" #include "inode.h" #include "io_read.h" #include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "move.h" #include "migrate.h" #include "movinggc.h" #include "nocow_locking.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" #include "replicas.h" #include "sb-clean.h" #include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" #include "subvolume.h" #include "super.h" #include "super-io.h" #include "sysfs.h" #include "thread_with_file.h" #include "trace.h" #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/sysfs.h> #include <crypto/hash.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); MODULE_SOFTDEP("pre: crc32c"); MODULE_SOFTDEP("pre: crc64"); MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); const char * const bch2_fs_flag_strs[] = { #define x(n) #n, BCH_FS_FLAGS() #undef x NULL }; void bch2_print_str(struct bch_fs *c, const char *str) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); if (unlikely(stdio)) { bch2_stdio_redirect_printf(stdio, true, "%s", str); return; } #endif bch2_print_string_as_lines(KERN_ERR, str); } __printf(2, 0) static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) { #ifdef __KERNEL__ if (unlikely(stdio)) { if (fmt[0] == KERN_SOH[0]) fmt += 2; bch2_stdio_redirect_vprintf(stdio, true, fmt, args); return; } #endif vprintk(fmt, args); } void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) { struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; va_list args; va_start(args, fmt); bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } void __bch2_print(struct bch_fs *c, const char *fmt, ...) { struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); va_list args; va_start(args, fmt); bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ }; \ \ static const struct attribute_group *type ## _groups[] = { \ &type ## _group, \ NULL \ }; \ \ static const struct kobj_type type ## _ktype = { \ .release = type ## _release, \ .sysfs_ops = &type ## _sysfs_ops, \ .default_groups = type ## _groups \ } static void bch2_fs_release(struct kobject *); static void bch2_dev_release(struct kobject *); static void bch2_fs_counters_release(struct kobject *k) { } static void bch2_fs_internal_release(struct kobject *k) { } static void bch2_fs_opts_dir_release(struct kobject *k) { } static void bch2_fs_time_stats_release(struct kobject *k) { } KTYPE(bch2_fs); KTYPE(bch2_fs_counters); KTYPE(bch2_fs_internal); KTYPE(bch2_fs_opts_dir); KTYPE(bch2_fs_time_stats); KTYPE(bch2_dev); static struct kset *bcachefs_kset; static LIST_HEAD(bch_fs_list); static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { struct bch_fs *c; mutex_lock(&bch_fs_list_lock); rcu_read_lock(); list_for_each_entry(c, &bch_fs_list, list) for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); goto found; } c = NULL; found: rcu_read_unlock(); mutex_unlock(&bch_fs_list_lock); return c; } static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) { struct bch_fs *c; lockdep_assert_held(&bch_fs_list_lock); list_for_each_entry(c, &bch_fs_list, list) if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) return c; return NULL; } struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) { struct bch_fs *c; mutex_lock(&bch_fs_list_lock); c = __bch2_uuid_to_fs(uuid); if (c) closure_get(&c->cl); mutex_unlock(&bch_fs_list_lock); return c; } /* Filesystem RO/RW: */ /* * For startup/shutdown of RW stuff, the dependencies are: * * - foreground writes depend on copygc and rebalance (to free up space) * * - copygc and rebalance depend on mark and sweep gc (they actually probably * don't because they either reserve ahead of time or don't block if * allocations fail, but allocations can require mark and sweep gc to run * because of generation number wraparound) * * - all of the above depends on the allocator threads * * - allocator depends on the journal (when it rewrites prios and gens) */ static void __bch2_fs_read_only(struct bch_fs *c) { unsigned clean_passes = 0; u64 seq = 0; bch2_fs_ec_stop(c); bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", journal_cur_seq(&c->journal)); do { clean_passes++; if (bch2_btree_interior_updates_flush(c) || bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || bch2_btree_flush_all_writes(c) || seq != atomic64_read(&c->journal.seq)) { seq = atomic64_read(&c->journal.seq); clean_passes = 0; } } while (clean_passes < 2); bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_replay_done, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); bch2_fs_journal_stop(&c->journal); bch_info(c, "%sshutdown complete, journal seq %llu", test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", c->journal.seq_ondisk); /* * After stopping journal: */ for_each_member_device(c, ca) bch2_dev_allocator_remove(c, ca); } #ifndef BCH_WRITE_REF_DEBUG static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } #endif void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_journal_reclaim_stop(&c->journal); return; } BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); bch_verbose(c, "going read-only"); /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ set_bit(BCH_FS_going_ro, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); #else for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) bch2_write_ref_put(c, i); #endif /* * If we're not doing an emergency shutdown, we want to wait on * outstanding writes to complete so they don't see spurious errors due * to shutting down the allocator: * * If we are doing an emergency shutdown outstanding writes may * hang until we shutdown the allocator so we don't want to wait * on outstanding writes before shutting everything down - but * we do need to wait on them before returning and signalling * that going RO is complete: */ wait_event(bch2_read_only_wait, test_bit(BCH_FS_write_disable_complete, &c->flags) || test_bit(BCH_FS_emergency_ro, &c->flags)); bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); if (writes_disabled) bch_verbose(c, "finished waiting for writes to stop"); __bch2_fs_read_only(c); wait_event(bch2_read_only_wait, test_bit(BCH_FS_write_disable_complete, &c->flags)); if (!writes_disabled) bch_verbose(c, "finished waiting for writes to stop"); clear_bit(BCH_FS_write_disable_complete, &c->flags); clear_bit(BCH_FS_going_ro, &c->flags); clear_bit(BCH_FS_rw, &c->flags); if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_error, &c->flags) && !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch2_verify_accounting_clean(c); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } else { bch_verbose(c, "done going read-only, filesystem not clean"); } } static void bch2_fs_read_only_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, read_only_work); down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); } static void bch2_fs_read_only_async(struct bch_fs *c) { queue_work(system_long_wq, &c->read_only_work); } bool bch2_fs_emergency_read_only(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); wake_up(&bch2_read_only_wait); return ret; } static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; /* * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. * * Ideally we'd start copygc/rebalance earlier instead of waiting for * all of recovery/fsck to complete: */ ret = bch2_copygc_start(c); if (ret) { bch_err(c, "error starting copygc thread"); return ret; } ret = bch2_rebalance_start(c); if (ret) { bch_err(c, "error starting rebalance thread"); return ret; } return 0; } static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } if (test_bit(BCH_FS_rw, &c->flags)) return 0; bch_info(c, "going read-write"); ret = bch2_sb_members_v2_init(c); if (ret) goto err; ret = bch2_fs_mark_dirty(c); if (ret) goto err; clear_bit(BCH_FS_clean_shutdown, &c->flags); /* * First journal write must be a flush write: after a clean shutdown we * don't read the journal, so the first journal write may end up * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ set_bit(JOURNAL_need_flush_write, &c->journal.flags); set_bit(JOURNAL_running, &c->journal.flags); for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } #endif ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; if (!early) { ret = bch2_fs_read_write_late(c); if (ret) goto err; } bch2_do_discards(c); bch2_do_invalidates(c); bch2_do_stripe_deletes(c); bch2_do_pending_node_rewrites(c); return 0; err: if (test_bit(BCH_FS_rw, &c->flags)) bch2_fs_read_only(c); else __bch2_fs_read_only(c); return ret; } int bch2_fs_read_write(struct bch_fs *c) { if (c->opts.recovery_pass_last && c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) return -BCH_ERR_erofs_nochanges; return __bch2_fs_read_write(c, false); } int bch2_fs_read_write_early(struct bch_fs *c) { lockdep_assert_held(&c->state_lock); return __bch2_fs_read_write(c, true); } /* Filesystem startup/shutdown: */ static void __bch2_fs_free(struct bch_fs *c) { for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); bch2_fs_vfs_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); bch2_fs_btree_iter_exit(c); bch2_fs_replicas_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); WARN(v, "online_reserved not 0 at shutdown: %lli", v); free_percpu(c->online_reserved); } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); free_percpu(c->usage); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_exit(&c->writes); #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); if (c->btree_write_submit_wq) destroy_workqueue(c->btree_write_submit_wq); if (c->btree_read_complete_wq) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) destroy_workqueue(c->btree_io_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); kvfree(c); module_put(THIS_MODULE); } static void bch2_fs_release(struct kobject *kobj) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); __bch2_fs_free(c); } void __bch2_fs_stop(struct bch_fs *c) { bch_verbose(c, "shutting down"); set_bit(BCH_FS_stopping, &c->flags); down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); for_each_member_device(c, ca) bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); bch2_ro_ref_put(c); wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); kobject_put(&c->internal); /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); for_each_member_device(c, ca) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); } void bch2_fs_free(struct bch_fs *c) { unsigned i; mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); closure_sync(&c->cl); closure_debug_destroy(&c->cl); for (i = 0; i < c->sb.nr_devices; i++) { struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { EBUG_ON(atomic_long_read(&ca->ref) != 1); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } } bch_verbose(c, "shutdown complete"); kobject_put(&c->kobj); } void bch2_fs_stop(struct bch_fs *c) { __bch2_fs_stop(c); bch2_fs_free(c); } static int bch2_fs_online(struct bch_fs *c) { int ret = 0; lockdep_assert_held(&bch_fs_list_lock); if (__bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); return -EINVAL; } ret = bch2_fs_chardev_init(c); if (ret) { bch_err(c, "error creating character device"); return ret; } bch2_fs_debug_init(c); ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: #endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { bch_err(c, "error creating sysfs objects"); return ret; } down_write(&c->state_lock); for_each_member_device(c, ca) { ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); bch2_dev_put(ca); goto err; } } BUG_ON(!list_empty(&c->list)); list_add(&c->list, &bch_fs_list); err: up_write(&c->state_lock); return ret; } static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_fs *c; struct printbuf name = PRINTBUF; unsigned i, iter_size; int ret = 0; c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; } c->stdio = (void *)(unsigned long) opts.stdio; __module_get(THIS_MODULE); closure_init(&c->cl, NULL); c->kobj.kset = bcachefs_kset; kobject_init(&c->kobj, &bch2_fs_ktype); kobject_init(&c->internal, &bch2_fs_internal_ktype); kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); c->minor = -1; c->disk_sb.fs_sb = true; init_rwsem(&c->state_lock); mutex_init(&c->sb_lock); mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); sema_init(&c->online_fsck_mutex, 1); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); bch2_fs_move_init(c); bch2_fs_sb_errors_init_early(c); INIT_LIST_HEAD(&c->list); mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); init_rwsem(&c->snapshot_create_lock); spin_lock_init(&c->btree_write_error_lock); INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_error_msgs); mutex_init(&c->fsck_error_msgs_lock); seqcount_init(&c->usage_lock); sema_init(&c->io_in_flight, 128); INIT_LIST_HEAD(&c->vfs_inodes_list); mutex_init(&c->vfs_inodes_lock); c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); if (ret) goto err; mutex_lock(&c->sb_lock); ret = bch2_sb_to_fs(c, sb); mutex_unlock(&c->sb_lock); if (ret) goto err; pr_uuid(&name, c->sb.user_uuid.b); ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; strscpy(c->name, name.buf, sizeof(c->name)); printbuf_exit(&name); /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); c->opts = bch2_opts_default; ret = bch2_opts_from_sb(&c->opts, sb); if (ret) goto err; bch2_opts_apply(&c->opts, opts); c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); if (bch2_fs_init_fault("fs_alloc")) { bch_err(c, "fs_alloc fault injected"); ret = -EFAULT; goto err; } iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || #endif mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } ret = bch2_fs_counters_init(c) ?: bch2_fs_sb_errors_init(c) ?: bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_read_init(c) ?: bch2_fs_io_write_init(c) ?: bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: bch2_fs_vfs_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); if (ret) goto err; for (i = 0; i < c->sb.nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; ret = bch2_dev_alloc(c, i); if (ret) goto err; } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); bch2_journal_entry_res_resize(&c->journal, &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); mutex_lock(&bch_fs_list_lock); ret = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); if (ret) goto err; out: return c; err: bch2_fs_free(c); c = ERR_PTR(ret); goto out; } noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; struct printbuf p = PRINTBUF; bool first = true; prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { prt_str(&p, " opts="); first = false; prt_printf(&p, "ro"); } for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; prt_str(&p, first ? " opts=" : ","); first = false; bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } bch_info(c, "%s", p.buf); printbuf_exit(&p); } int bch2_fs_start(struct bch_fs *c) { time64_t now = ktime_get_real_seconds(); int ret; print_mount_opts(c); down_write(&c->state_lock); BUG_ON(test_bit(BCH_FS_started, &c->flags)); mutex_lock(&c->sb_lock); ret = bch2_sb_members_v2_init(c); if (ret) { mutex_unlock(&c->sb_lock); goto err; } for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); struct bch_sb_field_ext *ext = bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); if (!ext) { bch_err(c, "insufficient space in superblock for sb_field_ext"); ret = -BCH_ERR_ENOSPC_sb; goto err; } for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); if (ret) goto err; ret = bch2_opts_check_may_set(c); if (ret) goto err; if (bch2_fs_init_fault("fs_start")) { bch_err(c, "fs_start fault injected"); ret = -EINVAL; goto err; } set_bit(BCH_FS_started, &c->flags); if (c->opts.read_only) { bch2_fs_read_only(c); } else { ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); if (ret) goto err; } ret = 0; err: if (ret) bch_err_msg(c, ret, "starting filesystem"); else bch_verbose(c, "done starting filesystem"); up_write(&c->state_lock); return ret; } static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) return -BCH_ERR_mismatched_block_size; if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) return -BCH_ERR_bucket_size_too_small; return 0; } static int bch2_dev_in_fs(struct bch_sb_handle *fs, struct bch_sb_handle *sb, struct bch_opts *opts) { if (fs == sb) return 0; if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->sb->block_size != sb->sb->block_size) return -BCH_ERR_mismatched_block_size; if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) return 0; if (fs->sb->seq == sb->sb->seq && fs->sb->write_time != sb->sb->write_time) { struct printbuf buf = PRINTBUF; prt_str(&buf, "Split brain detected between "); prt_bdevname(&buf, sb->bdev); prt_str(&buf, " and "); prt_bdevname(&buf, fs->bdev); prt_char(&buf, ':'); prt_newline(&buf); prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); prt_newline(&buf); prt_bdevname(&buf, fs->bdev); prt_char(&buf, ' '); bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; prt_newline(&buf); prt_bdevname(&buf, sb->bdev); prt_char(&buf, ' '); bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; prt_newline(&buf); if (!opts->no_splitbrain_check) prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain; } struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); u64 seq_from_fs = le64_to_cpu(m.seq); u64 seq_from_member = le64_to_cpu(sb->sb->seq); if (seq_from_fs && seq_from_fs < seq_from_member) { struct printbuf buf = PRINTBUF; prt_str(&buf, "Split brain detected between "); prt_bdevname(&buf, sb->bdev); prt_str(&buf, " and "); prt_bdevname(&buf, fs->bdev); prt_char(&buf, ':'); prt_newline(&buf); prt_bdevname(&buf, fs->bdev); prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); if (!opts->no_splitbrain_check) { prt_str(&buf, "Not using "); prt_bdevname(&buf, sb->bdev); } pr_err("%s", buf.buf); printbuf_exit(&buf); if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain; } return 0; } /* Device startup/shutdown: */ static void bch2_dev_release(struct kobject *kobj) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); kfree(ca); } static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif kobject_put(&ca->kobj); } static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); if (percpu_ref_is_zero(&ca->io_ref)) return; __bch2_dev_read_only(c, ca); reinit_completion(&ca->io_ref_completion); percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); } #ifndef CONFIG_BCACHEFS_DEBUG static void bch2_dev_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, ref); complete(&ca->ref_completion); } #endif static void bch2_dev_io_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); complete(&ca->io_ref_completion); } static void bch2_dev_unlink(struct bch_dev *ca) { struct kobject *b; /* * This is racy w.r.t. the underlying block device being hot-removed, * which removes it from sysfs. * * It'd be lovely if we had a way to handle this race, but the sysfs * code doesn't appear to provide a good method and block/holder.c is * susceptible as well: */ if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev && (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { sysfs_remove_link(b, "bcachefs"); sysfs_remove_link(&ca->kobj, "block"); } } static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; if (!c->kobj.state_in_sysfs) return 0; if (!ca->kobj.state_in_sysfs) { ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx); if (ret) return ret; } if (ca->disk_sb.bdev) { struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); if (ret) return ret; ret = sysfs_create_link(&ca->kobj, block, "block"); if (ret) return ret; } return 0; } static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, struct bch_member *member) { struct bch_dev *ca; unsigned i; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) return NULL; kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); init_rwsem(&ca->bucket_lock); INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_quantiles_init(&ca->io_latency[READ]); bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); for (i = 0; i < ARRAY_SIZE(member->errors); i++) atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); ca->uuid = member->uuid; ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / btree_sectors(c)); #ifndef CONFIG_BCACHEFS_DEBUG if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) goto err; #else atomic_long_set(&ca->ref, 1); #endif bch2_dev_allocator_background_init(ca); if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; return ca; err: bch2_dev_free(ca); return NULL; } static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); if (bch2_dev_sysfs_online(c, ca)) pr_warn("error creating sysfs objects"); } static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; ca = __bch2_dev_alloc(c, &member); if (!ca) goto err; ca->fs = c; bch2_dev_attach(c, ca, dev_idx); return ret; err: if (ca) bch2_dev_free(ca); return -BCH_ERR_ENOMEM_dev_alloc; } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) { unsigned ret; if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); return -BCH_ERR_device_already_online; } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); return -BCH_ERR_device_size_too_small; } BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); ca->dev = ca->disk_sb.bdev->bd_dev; percpu_ref_reinit(&ca->io_ref); return 0; } static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) { struct bch_dev *ca; int ret; lockdep_assert_held(&c->state_lock); if (le64_to_cpu(sb->sb->seq) > le64_to_cpu(c->disk_sb.sb->seq)) bch2_sb_to_fs(c, sb->sb); BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); ca = bch2_dev_locked(c, sb->sb->dev_idx); ret = __bch2_dev_attach_bdev(ca, sb); if (ret) return ret; bch2_dev_sysfs_online(c, ca); struct printbuf name = PRINTBUF; prt_bdevname(&name, ca->disk_sb.bdev); if (c->sb.nr_devices == 1) strscpy(c->name, name.buf, sizeof(c->name)); strscpy(ca->name, name.buf, sizeof(ca->name)); printbuf_exit(&name); rebalance_wakeup(c); return 0; } /* Device management: */ /* * Note: this function is also used by the error paths - when a particular * device sees an error, we call it to determine whether we can just set the * device RO, or - if this function returns false - we'll set the whole * filesystem RO: * * XXX: maybe we should be more explicit about whether we're changing state * because we got an error or what have you? */ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; int nr_rw = 0, required; lockdep_assert_held(&c->state_lock); switch (new_state) { case BCH_MEMBER_STATE_rw: return true; case BCH_MEMBER_STATE_ro: if (ca->mi.state != BCH_MEMBER_STATE_rw) return true; /* do we have enough devices to write to? */ for_each_member_device(c, ca2) if (ca2 != ca) nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? c->opts.metadata_replicas : metadata_replicas_required(c), !(flags & BCH_FORCE_IF_DATA_DEGRADED) ? c->opts.data_replicas : data_replicas_required(c)); return nr_rw >= required; case BCH_MEMBER_STATE_failed: case BCH_MEMBER_STATE_spare: if (ca->mi.state != BCH_MEMBER_STATE_rw && ca->mi.state != BCH_MEMBER_STATE_ro) return true; /* do we have enough devices to read from? */ new_online_devs = bch2_online_devs(c); __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); default: BUG(); } } static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i, flags = 0; if (c->opts.very_degraded) flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; if (c->opts.degraded) flags |= BCH_FORCE_IF_DEGRADED; if (!c->opts.degraded && !c->opts.very_degraded) { mutex_lock(&c->sb_lock); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; ca = bch2_dev_locked(c, i); if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro)) { mutex_unlock(&c->sb_lock); return false; } } mutex_unlock(&c->sb_lock); } return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); } static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { /* * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_member *m; int ret = 0; if (ca->mi.state == new_state) return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) return -BCH_ERR_device_state_not_allowed; if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); bch_notice(ca, "%s", bch2_member_states[new_state]); mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_STATE(m, new_state); bch2_write_super(c); mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); rebalance_wakeup(c); return ret; } int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { int ret; down_write(&c->state_lock); ret = __bch2_dev_set_state(c, ca, new_state, flags); up_write(&c->state_lock); return ret; } /* Device add/removal: */ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; int ret; down_write(&c->state_lock); /* * We consume a reference to ca->ref, regardless of whether we succeed * or fail: */ bch2_dev_put(ca); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); ret = -BCH_ERR_device_state_not_allowed; goto err; } __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err; /* * We need to flush the entire journal to get rid of keys that reference * the device being removed before removing the superblock entry */ bch2_journal_flush_all_pins(&c->journal); /* * this is really just needed for the bch2_replicas_gc_(start|end) * calls, and could be cleaned up: */ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; data = bch2_dev_has_data(c, ca); if (data) { struct printbuf data_has = PRINTBUF; prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; goto err; } __bch2_dev_offline(c, ca); mutex_lock(&c->sb_lock); rcu_assign_pointer(c->devs[ca->dev_idx], NULL); mutex_unlock(&c->sb_lock); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); #else ca->dying = true; bch2_dev_put(ca); #endif wait_for_completion(&ca->ref_completion); bch2_dev_free(ca); /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: */ mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); mutex_unlock(&c->sb_lock); up_write(&c->state_lock); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; } /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; int ret; ret = bch2_read_super(path, &opts, &sb); bch_err_msg(c, ret, "reading super"); if (ret) goto err; struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); if (label.allocation_failure) { ret = -ENOMEM; goto err; } } ret = bch2_dev_may_add(sb.sb, c); if (ret) goto err; ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { ret = -ENOMEM; goto err; } ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) goto err; ret = bch2_dev_journal_alloc(ca, true); bch_err_msg(c, ret, "allocating journal"); if (ret) goto err; down_write(&c->state_lock); mutex_lock(&c->sb_lock); ret = bch2_sb_from_fs(c, ca); bch_err_msg(c, ret, "setting up new superblock"); if (ret) goto err_unlock; if (dynamic_fault("bcachefs:add:no_slot")) goto err_unlock; ret = bch2_sb_member_alloc(c); if (ret < 0) { bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } unsigned dev_idx = ret; /* success: */ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); bch_err_msg(c, ret, "creating new label"); if (ret) goto err_unlock; } bch2_write_super(c); mutex_unlock(&c->sb_lock); ret = bch2_dev_usage_init(ca, false); if (ret) goto err_late; ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) goto err_late; ret = bch2_fs_freespace_init(c); bch_err_msg(ca, ret, "initializing free space"); if (ret) goto err_late; ca->new_fs_bucket_idx = 0; if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; err_unlock: mutex_unlock(&c->sb_lock); up_write(&c->state_lock); err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); printbuf_exit(&label); printbuf_exit(&errbuf); bch_err_fn(c, ret); return ret; err_late: up_write(&c->state_lock); ca = NULL; goto err; } /* Hot add existing device to running filesystem: */ int bch2_dev_online(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; struct bch_dev *ca; unsigned dev_idx; int ret; down_write(&c->state_lock); ret = bch2_read_super(path, &opts, &sb); if (ret) { up_write(&c->state_lock); return ret; } dev_idx = sb.sb->dev_idx; ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; ret = bch2_dev_attach_bdev(c, &sb); if (ret) goto err; ca = bch2_dev_locked(c, dev_idx); ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); if (ret) goto err; if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); if (!ca->mi.freespace_initialized) { ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); bch_err_msg(ca, ret, "initializing free space"); if (ret) goto err; } if (!ca->journal.nr) { ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; } mutex_lock(&c->sb_lock); bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); up_write(&c->state_lock); return 0; err: up_write(&c->state_lock); bch2_free_super(&sb); return ret; } int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) { down_write(&c->state_lock); if (!bch2_dev_is_online(ca)) { bch_err(ca, "Already offline"); up_write(&c->state_lock); return 0; } if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); return -BCH_ERR_device_state_not_allowed; } __bch2_dev_offline(c, ca); up_write(&c->state_lock); return 0; } int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; u64 old_nbuckets; int ret = 0; down_write(&c->state_lock); old_nbuckets = ca->mi.nbuckets; if (nbuckets < ca->mi.nbuckets) { bch_err(ca, "Cannot shrink yet"); ret = -EINVAL; goto err; } if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { bch_err(ca, "New device size too big (%llu greater than max %u)", nbuckets, BCH_MEMBER_NBUCKETS_MAX); ret = -BCH_ERR_device_size_too_big; goto err; } if (bch2_dev_is_online(ca) && get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); ret = -BCH_ERR_device_size_too_small; goto err; } ret = bch2_dev_buckets_resize(c, ca, nbuckets); bch_err_msg(ca, ret, "resizing buckets"); if (ret) goto err; ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err; mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); m->nbuckets = cpu_to_le64(nbuckets); bch2_write_super(c); mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_dev_data_type, .dev_data_type.dev = ca->dev_idx, .dev_data_type.data_type = BCH_DATA_free, }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; } bch2_recalc_capacity(c); err: up_write(&c->state_lock); return ret; } /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { if (!strncmp(name, "/dev/", strlen("/dev/"))) name += strlen("/dev/"); for_each_member_device(c, ca) if (!strcmp(name, ca->name)) return ca; return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) { return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); } struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); if (!nr_devices) { ret = -EINVAL; goto err; } ret = darray_make_room(&sbs, nr_devices); if (ret) goto err; for (unsigned i = 0; i < nr_devices; i++) { struct bch_sb_handle sb = { NULL }; ret = bch2_read_super(devices[i], &opts, &sb); if (ret) goto err; BUG_ON(darray_push(&sbs, sb)); } if (opts.nochanges && !opts.read_only) { ret = -BCH_ERR_erofs_nochanges; goto err_print; } darray_for_each(sbs, sb) if (!best || sb_cmp(sb->sb, best->sb) > 0) best = sb; darray_for_each_reverse(sbs, sb) { ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { bch2_free_super(sb); darray_remove_item(&sbs, sb); best -= best > sb; ret = 0; continue; } if (ret) goto err_print; } c = bch2_fs_alloc(best->sb, opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; down_write(&c->state_lock); darray_for_each(sbs, sb) { ret = bch2_dev_attach_bdev(c, sb); if (ret) { up_write(&c->state_lock); goto err; } } up_write(&c->state_lock); if (!bch2_fs_may_start(c)) { ret = -BCH_ERR_insufficient_devices_to_start; goto err_print; } if (!c->opts.nostart) { ret = bch2_fs_start(c); if (ret) goto err; } out: darray_for_each(sbs, sb) bch2_free_super(sb); darray_exit(&sbs); printbuf_exit(&errbuf); module_put(THIS_MODULE); return c; err_print: pr_err("bch_fs_open err opening %s: %s", devices[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); c = ERR_PTR(ret); goto out; } /* Global interfaces/init */ static void bcachefs_exit(void) { bch2_debug_exit(); bch2_vfs_exit(); bch2_chardev_exit(); bch2_btree_key_cache_exit(); if (bcachefs_kset) kset_unregister(bcachefs_kset); } static int __init bcachefs_init(void) { bch2_bkey_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || bch2_btree_key_cache_init() || bch2_chardev_init() || bch2_vfs_init() || bch2_debug_init()) goto err; return 0; err: bcachefs_exit(); return -ENOMEM; } #define BCH_DEBUG_PARAM(name, description) \ bool bch2_##name; \ module_param_named(name, bch2_##name, bool, 0644); \ MODULE_PARM_DESC(name, description); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM __maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); module_exit(bcachefs_exit); module_init(bcachefs_init); |
24 11 1 1 1 19 18 1 19 1 1 4 4 14 1 1 1 1 1 1 1 1 1 1 1 1 13 2 12 1 1 11 11 11 11 11 11 11 13 13 11 11 11 11 18 19 19 19 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 | /* * Copyright (c) 2003 Patrick McHardy, <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * 2003-10-17 - Ported from altq */ /* * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. * * Permission to use, copy, modify, and distribute this software and * its documentation is hereby granted (including for commercial or * for-profit use), provided that both the copyright notice and this * permission notice appear in all copies of the software, derivative * works, or modified versions, and any portions thereof. * * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * Carnegie Mellon encourages (but does not require) users of this * software to return any improvements or extensions that they make, * and to grant Carnegie Mellon the rights to redistribute these * changes without encumbrance. */ /* * H-FSC is described in Proceedings of SIGCOMM'97, * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, * Real-Time and Priority Service" * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. * * Oleg Cherevko <olwi@aq.ml.com.ua> added the upperlimit for link-sharing. * when a class has an upperlimit, the fit-time is computed from the * upperlimit service curve. the link-sharing scheduler does not schedule * a class whose fit-time exceeds the current time. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <asm/div64.h> /* * kernel internal service curve representation: * coordinates are given by 64 bit unsigned integers. * x-axis: unit is clock count. * y-axis: unit is byte. * * The service curve parameters are converted to the internal * representation. The slope values are scaled to avoid overflow. * the inverse slope values as well as the y-projection of the 1st * segment are kept in order to avoid 64-bit divide operations * that are expensive on 32-bit architectures. */ struct internal_sc { u64 sm1; /* scaled slope of the 1st segment */ u64 ism1; /* scaled inverse-slope of the 1st segment */ u64 dx; /* the x-projection of the 1st segment */ u64 dy; /* the y-projection of the 1st segment */ u64 sm2; /* scaled slope of the 2nd segment */ u64 ism2; /* scaled inverse-slope of the 2nd segment */ }; /* runtime service curve */ struct runtime_sc { u64 x; /* current starting position on x-axis */ u64 y; /* current starting position on y-axis */ u64 sm1; /* scaled slope of the 1st segment */ u64 ism1; /* scaled inverse-slope of the 1st segment */ u64 dx; /* the x-projection of the 1st segment */ u64 dy; /* the y-projection of the 1st segment */ u64 sm2; /* scaled slope of the 2nd segment */ u64 ism2; /* scaled inverse-slope of the 2nd segment */ }; enum hfsc_class_flags { HFSC_RSC = 0x1, HFSC_FSC = 0x2, HFSC_USC = 0x4 }; struct hfsc_class { struct Qdisc_class_common cl_common; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ struct tcf_block *block; unsigned int level; /* class level in hierarchy */ struct hfsc_sched *sched; /* scheduler data */ struct hfsc_class *cl_parent; /* parent class */ struct list_head siblings; /* sibling classes */ struct list_head children; /* child classes */ struct Qdisc *qdisc; /* leaf qdisc */ struct rb_node el_node; /* qdisc's eligible tree member */ struct rb_root vt_tree; /* active children sorted by cl_vt */ struct rb_node vt_node; /* parent's vt_tree member */ struct rb_root cf_tree; /* active children sorted by cl_f */ struct rb_node cf_node; /* parent's cf_heap member */ u64 cl_total; /* total work in bytes */ u64 cl_cumul; /* cumulative work in bytes done by real-time criteria */ u64 cl_d; /* deadline*/ u64 cl_e; /* eligible time */ u64 cl_vt; /* virtual time */ u64 cl_f; /* time when this class will fit for link-sharing, max(myf, cfmin) */ u64 cl_myf; /* my fit-time (calculated from this class's own upperlimit curve) */ u64 cl_cfmin; /* earliest children's fit-time (used with cl_myf to obtain cl_f) */ u64 cl_cvtmin; /* minimal virtual time among the children fit for link-sharing (monotonic within a period) */ u64 cl_vtadj; /* intra-period cumulative vt adjustment */ u64 cl_cvtoff; /* largest virtual time seen among the children */ struct internal_sc cl_rsc; /* internal real-time service curve */ struct internal_sc cl_fsc; /* internal fair service curve */ struct internal_sc cl_usc; /* internal upperlimit service curve */ struct runtime_sc cl_deadline; /* deadline curve */ struct runtime_sc cl_eligible; /* eligible curve */ struct runtime_sc cl_virtual; /* virtual curve */ struct runtime_sc cl_ulimit; /* upperlimit curve */ u8 cl_flags; /* which curves are valid */ u32 cl_vtperiod; /* vt period sequence number */ u32 cl_parentperiod;/* parent's vt period sequence number*/ u32 cl_nactive; /* number of active children */ }; struct hfsc_sched { u16 defcls; /* default class id */ struct hfsc_class root; /* root class */ struct Qdisc_class_hash clhash; /* class hash */ struct rb_root eligible; /* eligible tree */ struct qdisc_watchdog watchdog; /* watchdog timer */ }; #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ /* * eligible tree holds backlogged classes being sorted by their eligible times. * there is one eligible tree per hfsc instance. */ static void eltree_insert(struct hfsc_class *cl) { struct rb_node **p = &cl->sched->eligible.rb_node; struct rb_node *parent = NULL; struct hfsc_class *cl1; while (*p != NULL) { parent = *p; cl1 = rb_entry(parent, struct hfsc_class, el_node); if (cl->cl_e >= cl1->cl_e) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&cl->el_node, parent, p); rb_insert_color(&cl->el_node, &cl->sched->eligible); } static inline void eltree_remove(struct hfsc_class *cl) { rb_erase(&cl->el_node, &cl->sched->eligible); } static inline void eltree_update(struct hfsc_class *cl) { eltree_remove(cl); eltree_insert(cl); } /* find the class with the minimum deadline among the eligible classes */ static inline struct hfsc_class * eltree_get_mindl(struct hfsc_sched *q, u64 cur_time) { struct hfsc_class *p, *cl = NULL; struct rb_node *n; for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) { p = rb_entry(n, struct hfsc_class, el_node); if (p->cl_e > cur_time) break; if (cl == NULL || p->cl_d < cl->cl_d) cl = p; } return cl; } /* find the class with minimum eligible time among the eligible classes */ static inline struct hfsc_class * eltree_get_minel(struct hfsc_sched *q) { struct rb_node *n; n = rb_first(&q->eligible); if (n == NULL) return NULL; return rb_entry(n, struct hfsc_class, el_node); } /* * vttree holds holds backlogged child classes being sorted by their virtual * time. each intermediate class has one vttree. */ static void vttree_insert(struct hfsc_class *cl) { struct rb_node **p = &cl->cl_parent->vt_tree.rb_node; struct rb_node *parent = NULL; struct hfsc_class *cl1; while (*p != NULL) { parent = *p; cl1 = rb_entry(parent, struct hfsc_class, vt_node); if (cl->cl_vt >= cl1->cl_vt) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&cl->vt_node, parent, p); rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree); } static inline void vttree_remove(struct hfsc_class *cl) { rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree); } static inline void vttree_update(struct hfsc_class *cl) { vttree_remove(cl); vttree_insert(cl); } static inline struct hfsc_class * vttree_firstfit(struct hfsc_class *cl, u64 cur_time) { struct hfsc_class *p; struct rb_node *n; for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) { p = rb_entry(n, struct hfsc_class, vt_node); if (p->cl_f <= cur_time) return p; } return NULL; } /* * get the leaf class with the minimum vt in the hierarchy */ static struct hfsc_class * vttree_get_minvt(struct hfsc_class *cl, u64 cur_time) { /* if root-class's cfmin is bigger than cur_time nothing to do */ if (cl->cl_cfmin > cur_time) return NULL; while (cl->level > 0) { cl = vttree_firstfit(cl, cur_time); if (cl == NULL) return NULL; /* * update parent's cl_cvtmin. */ if (cl->cl_parent->cl_cvtmin < cl->cl_vt) cl->cl_parent->cl_cvtmin = cl->cl_vt; } return cl; } static void cftree_insert(struct hfsc_class *cl) { struct rb_node **p = &cl->cl_parent->cf_tree.rb_node; struct rb_node *parent = NULL; struct hfsc_class *cl1; while (*p != NULL) { parent = *p; cl1 = rb_entry(parent, struct hfsc_class, cf_node); if (cl->cl_f >= cl1->cl_f) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&cl->cf_node, parent, p); rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree); } static inline void cftree_remove(struct hfsc_class *cl) { rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree); } static inline void cftree_update(struct hfsc_class *cl) { cftree_remove(cl); cftree_insert(cl); } /* * service curve support functions * * external service curve parameters * m: bps * d: us * internal service curve parameters * sm: (bytes/psched_us) << SM_SHIFT * ism: (psched_us/byte) << ISM_SHIFT * dx: psched_us * * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us. * * sm and ism are scaled in order to keep effective digits. * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective * digits in decimal using the following table. * * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps * ------------+------------------------------------------------------- * bytes/1.024us 12.8e-3 128e-3 1280e-3 12800e-3 128000e-3 * * 1.024us/byte 78.125 7.8125 0.78125 0.078125 0.0078125 * * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18. */ #define SM_SHIFT (30 - PSCHED_SHIFT) #define ISM_SHIFT (8 + PSCHED_SHIFT) #define SM_MASK ((1ULL << SM_SHIFT) - 1) #define ISM_MASK ((1ULL << ISM_SHIFT) - 1) static inline u64 seg_x2y(u64 x, u64 sm) { u64 y; /* * compute * y = x * sm >> SM_SHIFT * but divide it for the upper and lower bits to avoid overflow */ y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); return y; } static inline u64 seg_y2x(u64 y, u64 ism) { u64 x; if (y == 0) x = 0; else if (ism == HT_INFINITY) x = HT_INFINITY; else { x = (y >> ISM_SHIFT) * ism + (((y & ISM_MASK) * ism) >> ISM_SHIFT); } return x; } /* Convert m (bps) into sm (bytes/psched us) */ static u64 m2sm(u32 m) { u64 sm; sm = ((u64)m << SM_SHIFT); sm += PSCHED_TICKS_PER_SEC - 1; do_div(sm, PSCHED_TICKS_PER_SEC); return sm; } /* convert m (bps) into ism (psched us/byte) */ static u64 m2ism(u32 m) { u64 ism; if (m == 0) ism = HT_INFINITY; else { ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT); ism += m - 1; do_div(ism, m); } return ism; } /* convert d (us) into dx (psched us) */ static u64 d2dx(u32 d) { u64 dx; dx = ((u64)d * PSCHED_TICKS_PER_SEC); dx += USEC_PER_SEC - 1; do_div(dx, USEC_PER_SEC); return dx; } /* convert sm (bytes/psched us) into m (bps) */ static u32 sm2m(u64 sm) { u64 m; m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT; return (u32)m; } /* convert dx (psched us) into d (us) */ static u32 dx2d(u64 dx) { u64 d; d = dx * USEC_PER_SEC; do_div(d, PSCHED_TICKS_PER_SEC); return (u32)d; } static void sc2isc(struct tc_service_curve *sc, struct internal_sc *isc) { isc->sm1 = m2sm(sc->m1); isc->ism1 = m2ism(sc->m1); isc->dx = d2dx(sc->d); isc->dy = seg_x2y(isc->dx, isc->sm1); isc->sm2 = m2sm(sc->m2); isc->ism2 = m2ism(sc->m2); } /* * initialize the runtime service curve with the given internal * service curve starting at (x, y). */ static void rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) { rtsc->x = x; rtsc->y = y; rtsc->sm1 = isc->sm1; rtsc->ism1 = isc->ism1; rtsc->dx = isc->dx; rtsc->dy = isc->dy; rtsc->sm2 = isc->sm2; rtsc->ism2 = isc->ism2; } /* * calculate the y-projection of the runtime service curve by the * given x-projection value */ static u64 rtsc_y2x(struct runtime_sc *rtsc, u64 y) { u64 x; if (y < rtsc->y) x = rtsc->x; else if (y <= rtsc->y + rtsc->dy) { /* x belongs to the 1st segment */ if (rtsc->dy == 0) x = rtsc->x + rtsc->dx; else x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); } else { /* x belongs to the 2nd segment */ x = rtsc->x + rtsc->dx + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); } return x; } static u64 rtsc_x2y(struct runtime_sc *rtsc, u64 x) { u64 y; if (x <= rtsc->x) y = rtsc->y; else if (x <= rtsc->x + rtsc->dx) /* y belongs to the 1st segment */ y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); else /* y belongs to the 2nd segment */ y = rtsc->y + rtsc->dy + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); return y; } /* * update the runtime service curve by taking the minimum of the current * runtime service curve and the service curve starting at (x, y). */ static void rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) { u64 y1, y2, dx, dy; u32 dsm; if (isc->sm1 <= isc->sm2) { /* service curve is convex */ y1 = rtsc_x2y(rtsc, x); if (y1 < y) /* the current rtsc is smaller */ return; rtsc->x = x; rtsc->y = y; return; } /* * service curve is concave * compute the two y values of the current rtsc * y1: at x * y2: at (x + dx) */ y1 = rtsc_x2y(rtsc, x); if (y1 <= y) { /* rtsc is below isc, no change to rtsc */ return; } y2 = rtsc_x2y(rtsc, x + isc->dx); if (y2 >= y + isc->dy) { /* rtsc is above isc, replace rtsc by isc */ rtsc->x = x; rtsc->y = y; rtsc->dx = isc->dx; rtsc->dy = isc->dy; return; } /* * the two curves intersect * compute the offsets (dx, dy) using the reverse * function of seg_x2y() * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) */ dx = (y1 - y) << SM_SHIFT; dsm = isc->sm1 - isc->sm2; do_div(dx, dsm); /* * check if (x, y1) belongs to the 1st segment of rtsc. * if so, add the offset. */ if (rtsc->x + rtsc->dx > x) dx += rtsc->x + rtsc->dx - x; dy = seg_x2y(dx, isc->sm1); rtsc->x = x; rtsc->y = y; rtsc->dx = dx; rtsc->dy = dy; } static void init_ed(struct hfsc_class *cl, unsigned int next_len) { u64 cur_time = psched_get_time(); /* update the deadline curve */ rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); /* * update the eligible curve. * for concave, it is equal to the deadline curve. * for convex, it is a linear curve with slope m2. */ cl->cl_eligible = cl->cl_deadline; if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { cl->cl_eligible.dx = 0; cl->cl_eligible.dy = 0; } /* compute e and d */ cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); eltree_insert(cl); } static void update_ed(struct hfsc_class *cl, unsigned int next_len) { cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); eltree_update(cl); } static inline void update_d(struct hfsc_class *cl, unsigned int next_len) { cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); } static inline void update_cfmin(struct hfsc_class *cl) { struct rb_node *n = rb_first(&cl->cf_tree); struct hfsc_class *p; if (n == NULL) { cl->cl_cfmin = 0; return; } p = rb_entry(n, struct hfsc_class, cf_node); cl->cl_cfmin = p->cl_f; } static void init_vf(struct hfsc_class *cl, unsigned int len) { struct hfsc_class *max_cl; struct rb_node *n; u64 vt, f, cur_time; int go_active; cur_time = 0; go_active = 1; for (; cl->cl_parent != NULL; cl = cl->cl_parent) { if (go_active && cl->cl_nactive++ == 0) go_active = 1; else go_active = 0; if (go_active) { n = rb_last(&cl->cl_parent->vt_tree); if (n != NULL) { max_cl = rb_entry(n, struct hfsc_class, vt_node); /* * set vt to the average of the min and max * classes. if the parent's period didn't * change, don't decrease vt of the class. */ vt = max_cl->cl_vt; if (cl->cl_parent->cl_cvtmin != 0) vt = (cl->cl_parent->cl_cvtmin + vt)/2; if (cl->cl_parent->cl_vtperiod != cl->cl_parentperiod || vt > cl->cl_vt) cl->cl_vt = vt; } else { /* * first child for a new parent backlog period. * initialize cl_vt to the highest value seen * among the siblings. this is analogous to * what cur_time would provide in realtime case. */ cl->cl_vt = cl->cl_parent->cl_cvtoff; cl->cl_parent->cl_cvtmin = 0; } /* update the virtual curve */ rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_vtadj = 0; cl->cl_vtperiod++; /* increment vt period */ cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; if (cl->cl_parent->cl_nactive == 0) cl->cl_parentperiod++; cl->cl_f = 0; vttree_insert(cl); cftree_insert(cl); if (cl->cl_flags & HFSC_USC) { /* class has upper limit curve */ if (cur_time == 0) cur_time = psched_get_time(); /* update the ulimit curve */ rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); /* compute myf */ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); } } f = max(cl->cl_myf, cl->cl_cfmin); if (f != cl->cl_f) { cl->cl_f = f; cftree_update(cl); } update_cfmin(cl->cl_parent); } } static void update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) { u64 f; /* , myf_bound, delta; */ int go_passive = 0; if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC) go_passive = 1; for (; cl->cl_parent != NULL; cl = cl->cl_parent) { cl->cl_total += len; if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0) continue; if (go_passive && --cl->cl_nactive == 0) go_passive = 1; else go_passive = 0; /* update vt */ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj; /* * if vt of the class is smaller than cvtmin, * the class was skipped in the past due to non-fit. * if so, we need to adjust vtadj. */ if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; cl->cl_vt = cl->cl_parent->cl_cvtmin; } if (go_passive) { /* no more active child, going passive */ /* update cvtoff of the parent class */ if (cl->cl_vt > cl->cl_parent->cl_cvtoff) cl->cl_parent->cl_cvtoff = cl->cl_vt; /* remove this class from the vt tree */ vttree_remove(cl); cftree_remove(cl); update_cfmin(cl->cl_parent); continue; } /* update the vt tree */ vttree_update(cl); /* update f */ if (cl->cl_flags & HFSC_USC) { cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); #if 0 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); /* * This code causes classes to stay way under their * limit when multiple classes are used at gigabit * speed. needs investigation. -kaber */ /* * if myf lags behind by more than one clock tick * from the current time, adjust myfadj to prevent * a rate-limited class from going greedy. * in a steady state under rate-limiting, myf * fluctuates within one clock tick. */ myf_bound = cur_time - PSCHED_JIFFIE2US(1); if (cl->cl_myf < myf_bound) { delta = cur_time - cl->cl_myf; cl->cl_myfadj += delta; cl->cl_myf += delta; } #endif } f = max(cl->cl_myf, cl->cl_cfmin); if (f != cl->cl_f) { cl->cl_f = f; cftree_update(cl); update_cfmin(cl->cl_parent); } } } static unsigned int qdisc_peek_len(struct Qdisc *sch) { struct sk_buff *skb; unsigned int len; skb = sch->ops->peek(sch); if (unlikely(skb == NULL)) { qdisc_warn_nonwc("qdisc_peek_len", sch); return 0; } len = qdisc_pkt_len(skb); return len; } static void hfsc_adjust_levels(struct hfsc_class *cl) { struct hfsc_class *p; unsigned int level; do { level = 0; list_for_each_entry(p, &cl->children, siblings) { if (p->level >= level) level = p->level + 1; } cl->level = level; } while ((cl = cl->cl_parent) != NULL); } static inline struct hfsc_class * hfsc_find_class(u32 classid, struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct Qdisc_class_common *clc; clc = qdisc_class_find(&q->clhash, classid); if (clc == NULL) return NULL; return container_of(clc, struct hfsc_class, cl_common); } static void hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, u64 cur_time) { sc2isc(rsc, &cl->cl_rsc); rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); cl->cl_eligible = cl->cl_deadline; if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { cl->cl_eligible.dx = 0; cl->cl_eligible.dy = 0; } cl->cl_flags |= HFSC_RSC; } static void hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) { sc2isc(fsc, &cl->cl_fsc); rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_flags |= HFSC_FSC; } static void hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, u64 cur_time) { sc2isc(usc, &cl->cl_usc); rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); cl->cl_flags |= HFSC_USC; } static void hfsc_upgrade_rt(struct hfsc_class *cl) { cl->cl_fsc = cl->cl_rsc; rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_flags |= HFSC_FSC; } static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, [TCA_HFSC_USC] = { .len = sizeof(struct tc_service_curve) }, }; static int hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)*arg; struct hfsc_class *parent = NULL; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_HFSC_MAX + 1]; struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; u64 cur_time; int err; if (opt == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_HFSC_MAX, opt, hfsc_policy, NULL); if (err < 0) return err; if (tb[TCA_HFSC_RSC]) { rsc = nla_data(tb[TCA_HFSC_RSC]); if (rsc->m1 == 0 && rsc->m2 == 0) rsc = NULL; } if (tb[TCA_HFSC_FSC]) { fsc = nla_data(tb[TCA_HFSC_FSC]); if (fsc->m1 == 0 && fsc->m2 == 0) fsc = NULL; } if (tb[TCA_HFSC_USC]) { usc = nla_data(tb[TCA_HFSC_USC]); if (usc->m1 == 0 && usc->m2 == 0) usc = NULL; } if (cl != NULL) { int old_flags; if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) return -EINVAL; if (cl->cl_parent == NULL && parentid != TC_H_ROOT) return -EINVAL; } cur_time = psched_get_time(); if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) return err; } sch_tree_lock(sch); old_flags = cl->cl_flags; if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) hfsc_change_fsc(cl, fsc); if (usc != NULL) hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { int len = qdisc_peek_len(cl->qdisc); if (cl->cl_flags & HFSC_RSC) { if (old_flags & HFSC_RSC) update_ed(cl, len); else init_ed(cl, len); } if (cl->cl_flags & HFSC_FSC) { if (old_flags & HFSC_FSC) update_vf(cl, 0, cur_time); else init_vf(cl, len); } } sch_tree_unlock(sch); return 0; } if (parentid == TC_H_ROOT) return -EEXIST; parent = &q->root; if (parentid) { parent = hfsc_find_class(parentid, sch); if (parent == NULL) return -ENOENT; } if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) return -EINVAL; if (hfsc_find_class(classid, sch)) return -EEXIST; if (rsc == NULL && fsc == NULL) return -EINVAL; cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); return err; } if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) { tcf_block_put(cl->block); kfree(cl); return err; } } if (rsc != NULL) hfsc_change_rsc(cl, rsc, 0); if (fsc != NULL) hfsc_change_fsc(cl, fsc); if (usc != NULL) hfsc_change_usc(cl, usc, 0); cl->cl_common.classid = classid; cl->sched = q; cl->cl_parent = parent; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; else qdisc_hash_add(cl->qdisc, true); INIT_LIST_HEAD(&cl->children); cl->vt_tree = RB_ROOT; cl->cf_tree = RB_ROOT; sch_tree_lock(sch); /* Check if the inner class is a misconfigured 'rt' */ if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) { NL_SET_ERR_MSG(extack, "Forced curve change on parent 'rt' to 'sc'"); hfsc_upgrade_rt(parent); } qdisc_class_hash_insert(&q->clhash, &cl->cl_common); list_add_tail(&cl->siblings, &parent->children); if (parent->level == 0) qdisc_purge_queue(parent->qdisc); hfsc_adjust_levels(parent); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; } static void hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) { struct hfsc_sched *q = qdisc_priv(sch); tcf_block_put(cl->block); qdisc_put(cl->qdisc); gen_kill_estimator(&cl->rate_est); if (cl != &q->root) kfree(cl); } static int hfsc_delete_class(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; if (cl->level > 0 || qdisc_class_in_use(&cl->cl_common) || cl == &q->root) { NL_SET_ERR_MSG(extack, "HFSC class in use"); return -EBUSY; } sch_tree_lock(sch); list_del(&cl->siblings); hfsc_adjust_levels(cl->cl_parent); qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->cl_common); sch_tree_unlock(sch); hfsc_destroy_class(sch, cl); return 0; } static struct hfsc_class * hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *head, *cl; struct tcf_result res; struct tcf_proto *tcf; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && (cl = hfsc_find_class(skb->priority, sch)) != NULL) if (cl->level == 0) return cl; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif cl = (struct hfsc_class *)res.class; if (!cl) { cl = hfsc_find_class(res.classid, sch); if (!cl) break; /* filter selected invalid classid */ if (cl->level >= head->level) break; /* filter may only point downwards */ } if (cl->level == 0) return cl; /* hit leaf class */ /* apply inner filter chain */ tcf = rcu_dereference_bh(cl->filter_list); head = cl; } /* classification failed, try default class */ cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), READ_ONCE(q->defcls)), sch); if (cl == NULL || cl->level > 0) return NULL; return cl; } static int hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct hfsc_class *cl = (struct hfsc_class *)arg; if (cl->level > 0) return -EINVAL; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->cl_common.classid, NULL); if (new == NULL) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } static struct Qdisc * hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; if (cl->level == 0) return cl->qdisc; return NULL; } static void hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. */ update_vf(cl, 0, 0); if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); } static unsigned long hfsc_search_class(struct Qdisc *sch, u32 classid) { return (unsigned long)hfsc_find_class(classid, sch); } static unsigned long hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { struct hfsc_class *p = (struct hfsc_class *)parent; struct hfsc_class *cl = hfsc_find_class(classid, sch); if (cl != NULL) { if (p != NULL && p->level <= cl->level) return 0; qdisc_class_get(&cl->cl_common); } return (unsigned long)cl; } static void hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct hfsc_class *cl = (struct hfsc_class *)arg; qdisc_class_put(&cl->cl_common); } static struct tcf_block *hfsc_tcf_block(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl = (struct hfsc_class *)arg; if (cl == NULL) cl = &q->root; return cl->block; } static int hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) { struct tc_service_curve tsc; tsc.m1 = sm2m(sc->sm1); tsc.d = dx2d(sc->dx); tsc.m2 = sm2m(sc->sm2); if (nla_put(skb, attr, sizeof(tsc), &tsc)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) { if ((cl->cl_flags & HFSC_RSC) && (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) goto nla_put_failure; if ((cl->cl_flags & HFSC_FSC) && (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) goto nla_put_failure; if ((cl->cl_flags & HFSC_USC) && (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) goto nla_put_failure; return skb->len; nla_put_failure: return -1; } static int hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct hfsc_class *cl = (struct hfsc_class *)arg; struct nlattr *nest; tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->cl_common.classid : TC_H_ROOT; tcm->tcm_handle = cl->cl_common.classid; if (cl->level == 0) tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (hfsc_dump_curves(skb, cl) < 0) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct hfsc_class *cl = (struct hfsc_class *)arg; struct tc_hfsc_stats xstats; __u32 qlen; qdisc_qstats_qlen_backlog(cl->qdisc, &qlen, &cl->qstats.backlog); xstats.level = cl->level; xstats.period = cl->cl_vtperiod; xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; } } } static void hfsc_schedule_watchdog(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; u64 next_time = 0; cl = eltree_get_minel(q); if (cl) next_time = cl->cl_e; if (q->root.cl_cfmin != 0) { if (next_time == 0 || next_time > q->root.cl_cfmin) next_time = q->root.cl_cfmin; } if (next_time) qdisc_watchdog_schedule(&q->watchdog, next_time); } static int hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; int err; qdisc_watchdog_init(&q->watchdog, sch); if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); q->defcls = qopt->defcls; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; q->eligible = RB_ROOT; err = tcf_block_get(&q->root.block, &q->root.filter_list, sch, extack); if (err) return err; gnet_stats_basic_sync_init(&q->root.bstats); q->root.cl_common.classid = sch->handle; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, NULL); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; else qdisc_hash_add(q->root.qdisc, true); INIT_LIST_HEAD(&q->root.children); q->root.vt_tree = RB_ROOT; q->root.cf_tree = RB_ROOT; qdisc_class_hash_insert(&q->clhash, &q->root.cl_common); qdisc_class_hash_grow(sch, &q->clhash); return 0; } static int hfsc_change_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hfsc_sched *q = qdisc_priv(sch); struct tc_hfsc_qopt *qopt; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); WRITE_ONCE(q->defcls, qopt->defcls); return 0; } static void hfsc_reset_class(struct hfsc_class *cl) { cl->cl_total = 0; cl->cl_cumul = 0; cl->cl_d = 0; cl->cl_e = 0; cl->cl_vt = 0; cl->cl_vtadj = 0; cl->cl_cvtmin = 0; cl->cl_cvtoff = 0; cl->cl_vtperiod = 0; cl->cl_parentperiod = 0; cl->cl_f = 0; cl->cl_myf = 0; cl->cl_cfmin = 0; cl->cl_nactive = 0; cl->vt_tree = RB_ROOT; cl->cf_tree = RB_ROOT; qdisc_reset(cl->qdisc); if (cl->cl_flags & HFSC_RSC) rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); if (cl->cl_flags & HFSC_FSC) rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); if (cl->cl_flags & HFSC_USC) rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); } static void hfsc_reset_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) hfsc_reset_class(cl); } q->eligible = RB_ROOT; qdisc_watchdog_cancel(&q->watchdog); } static void hfsc_destroy_qdisc(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hlist_node *next; struct hfsc_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { tcf_block_put(cl->block); cl->block = NULL; } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], cl_common.hnode) hfsc_destroy_class(sch, cl); } qdisc_class_hash_destroy(&q->clhash); qdisc_watchdog_cancel(&q->watchdog); } static int hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) { struct hfsc_sched *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; qopt.defcls = READ_ONCE(q->defcls); if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct hfsc_class *cl; int err; bool first; cl = hfsc_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } if (first) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) init_vf(cl, len); /* * If this is the first packet, isolate the head so an eventual * head drop before the first dequeue operation has no chance * to invalidate the deadline. */ if (cl->cl_flags & HFSC_RSC) cl->qdisc->ops->peek(cl->qdisc); } sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } static struct sk_buff * hfsc_dequeue(struct Qdisc *sch) { struct hfsc_sched *q = qdisc_priv(sch); struct hfsc_class *cl; struct sk_buff *skb; u64 cur_time; unsigned int next_len; int realtime = 0; if (sch->q.qlen == 0) return NULL; cur_time = psched_get_time(); /* * if there are eligible classes, use real-time criteria. * find the class with the minimum deadline among * the eligible classes. */ cl = eltree_get_mindl(q, cur_time); if (cl) { realtime = 1; } else { /* * use link-sharing criteria * get the class with the minimum vt in the hierarchy */ cl = vttree_get_minvt(&q->root, cur_time); if (cl == NULL) { qdisc_qstats_overlimit(sch); hfsc_schedule_watchdog(sch); return NULL; } } skb = qdisc_dequeue_peeked(cl->qdisc); if (skb == NULL) { qdisc_warn_nonwc("HFSC", cl->qdisc); return NULL; } bstats_update(&cl->bstats, skb); update_vf(cl, qdisc_pkt_len(skb), cur_time); if (realtime) cl->cl_cumul += qdisc_pkt_len(skb); if (cl->cl_flags & HFSC_RSC) { if (cl->qdisc->q.qlen != 0) { /* update ed */ next_len = qdisc_peek_len(cl->qdisc); if (realtime) update_ed(cl, next_len); else update_d(cl, next_len); } else { /* the class becomes passive */ eltree_remove(cl); } } qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static const struct Qdisc_class_ops hfsc_class_ops = { .change = hfsc_change_class, .delete = hfsc_delete_class, .graft = hfsc_graft_class, .leaf = hfsc_class_leaf, .qlen_notify = hfsc_qlen_notify, .find = hfsc_search_class, .bind_tcf = hfsc_bind_tcf, .unbind_tcf = hfsc_unbind_tcf, .tcf_block = hfsc_tcf_block, .dump = hfsc_dump_class, .dump_stats = hfsc_dump_class_stats, .walk = hfsc_walk }; static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .id = "hfsc", .init = hfsc_init_qdisc, .change = hfsc_change_qdisc, .reset = hfsc_reset_qdisc, .destroy = hfsc_destroy_qdisc, .dump = hfsc_dump_qdisc, .enqueue = hfsc_enqueue, .dequeue = hfsc_dequeue, .peek = qdisc_peek_dequeued, .cl_ops = &hfsc_class_ops, .priv_size = sizeof(struct hfsc_sched), .owner = THIS_MODULE }; MODULE_ALIAS_NET_SCH("hfsc"); static int __init hfsc_init(void) { return register_qdisc(&hfsc_qdisc_ops); } static void __exit hfsc_cleanup(void) { unregister_qdisc(&hfsc_qdisc_ops); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Hierarchical Fair Service Curve scheduler"); module_init(hfsc_init); module_exit(hfsc_cleanup); |
2 3 1 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Driver for Zarlink DVB-T ZL10353 demodulator * * Copyright (C) 2006, 2007 Christopher Pascoe <c.pascoe@itee.uq.edu.au> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/slab.h> #include <asm/div64.h> #include <media/dvb_frontend.h> #include "zl10353_priv.h" #include "zl10353.h" struct zl10353_state { struct i2c_adapter *i2c; struct dvb_frontend frontend; struct zl10353_config config; u32 bandwidth; u32 ucblocks; u32 frequency; }; static int debug; #define dprintk(args...) \ do { \ if (debug) printk(KERN_DEBUG "zl10353: " args); \ } while (0) static int debug_regs; static int zl10353_single_write(struct dvb_frontend *fe, u8 reg, u8 val) { struct zl10353_state *state = fe->demodulator_priv; u8 buf[2] = { reg, val }; struct i2c_msg msg = { .addr = state->config.demod_address, .flags = 0, .buf = buf, .len = 2 }; int err = i2c_transfer(state->i2c, &msg, 1); if (err != 1) { printk("zl10353: write to reg %x failed (err = %d)!\n", reg, err); return err; } return 0; } static int zl10353_write(struct dvb_frontend *fe, const u8 ibuf[], int ilen) { int err, i; for (i = 0; i < ilen - 1; i++) if ((err = zl10353_single_write(fe, ibuf[0] + i, ibuf[i + 1]))) return err; return 0; } static int zl10353_read_register(struct zl10353_state *state, u8 reg) { int ret; u8 b0[1] = { reg }; u8 b1[1] = { 0 }; struct i2c_msg msg[2] = { { .addr = state->config.demod_address, .flags = 0, .buf = b0, .len = 1 }, { .addr = state->config.demod_address, .flags = I2C_M_RD, .buf = b1, .len = 1 } }; ret = i2c_transfer(state->i2c, msg, 2); if (ret != 2) { printk("%s: readreg error (reg=%d, ret==%i)\n", __func__, reg, ret); return ret; } return b1[0]; } static void zl10353_dump_regs(struct dvb_frontend *fe) { struct zl10353_state *state = fe->demodulator_priv; int ret; u8 reg; /* Dump all registers. */ for (reg = 0; ; reg++) { if (reg % 16 == 0) { if (reg) printk(KERN_CONT "\n"); printk(KERN_DEBUG "%02x:", reg); } ret = zl10353_read_register(state, reg); if (ret >= 0) printk(KERN_CONT " %02x", (u8)ret); else printk(KERN_CONT " --"); if (reg == 0xff) break; } printk(KERN_CONT "\n"); } static void zl10353_calc_nominal_rate(struct dvb_frontend *fe, u32 bandwidth, u16 *nominal_rate) { struct zl10353_state *state = fe->demodulator_priv; u32 adc_clock = 450560; /* 45.056 MHz */ u64 value; u8 bw = bandwidth / 1000000; if (state->config.adc_clock) adc_clock = state->config.adc_clock; value = (u64)10 * (1 << 23) / 7 * 125; value = (bw * value) + adc_clock / 2; *nominal_rate = div_u64(value, adc_clock); dprintk("%s: bw %d, adc_clock %d => 0x%x\n", __func__, bw, adc_clock, *nominal_rate); } static void zl10353_calc_input_freq(struct dvb_frontend *fe, u16 *input_freq) { struct zl10353_state *state = fe->demodulator_priv; u32 adc_clock = 450560; /* 45.056 MHz */ int if2 = 361667; /* 36.1667 MHz */ int ife; u64 value; if (state->config.adc_clock) adc_clock = state->config.adc_clock; if (state->config.if2) if2 = state->config.if2; if (adc_clock >= if2 * 2) ife = if2; else { ife = adc_clock - (if2 % adc_clock); if (ife > adc_clock / 2) ife = adc_clock - ife; } value = div_u64((u64)65536 * ife + adc_clock / 2, adc_clock); *input_freq = -value; dprintk("%s: if2 %d, ife %d, adc_clock %d => %d / 0x%x\n", __func__, if2, ife, adc_clock, -(int)value, *input_freq); } static int zl10353_sleep(struct dvb_frontend *fe) { static u8 zl10353_softdown[] = { 0x50, 0x0C, 0x44 }; zl10353_write(fe, zl10353_softdown, sizeof(zl10353_softdown)); return 0; } static int zl10353_set_parameters(struct dvb_frontend *fe) { struct dtv_frontend_properties *c = &fe->dtv_property_cache; struct zl10353_state *state = fe->demodulator_priv; u16 nominal_rate, input_freq; u8 pllbuf[6] = { 0x67 }, acq_ctl = 0; u16 tps = 0; state->frequency = c->frequency; zl10353_single_write(fe, RESET, 0x80); udelay(200); zl10353_single_write(fe, 0xEA, 0x01); udelay(200); zl10353_single_write(fe, 0xEA, 0x00); zl10353_single_write(fe, AGC_TARGET, 0x28); if (c->transmission_mode != TRANSMISSION_MODE_AUTO) acq_ctl |= (1 << 0); if (c->guard_interval != GUARD_INTERVAL_AUTO) acq_ctl |= (1 << 1); zl10353_single_write(fe, ACQ_CTL, acq_ctl); switch (c->bandwidth_hz) { case 6000000: /* These are extrapolated from the 7 and 8MHz values */ zl10353_single_write(fe, MCLK_RATIO, 0x97); zl10353_single_write(fe, 0x64, 0x34); zl10353_single_write(fe, 0xcc, 0xdd); break; case 7000000: zl10353_single_write(fe, MCLK_RATIO, 0x86); zl10353_single_write(fe, 0x64, 0x35); zl10353_single_write(fe, 0xcc, 0x73); break; default: c->bandwidth_hz = 8000000; fallthrough; case 8000000: zl10353_single_write(fe, MCLK_RATIO, 0x75); zl10353_single_write(fe, 0x64, 0x36); zl10353_single_write(fe, 0xcc, 0x73); } zl10353_calc_nominal_rate(fe, c->bandwidth_hz, &nominal_rate); zl10353_single_write(fe, TRL_NOMINAL_RATE_1, msb(nominal_rate)); zl10353_single_write(fe, TRL_NOMINAL_RATE_0, lsb(nominal_rate)); state->bandwidth = c->bandwidth_hz; zl10353_calc_input_freq(fe, &input_freq); zl10353_single_write(fe, INPUT_FREQ_1, msb(input_freq)); zl10353_single_write(fe, INPUT_FREQ_0, lsb(input_freq)); /* Hint at TPS settings */ switch (c->code_rate_HP) { case FEC_2_3: tps |= (1 << 7); break; case FEC_3_4: tps |= (2 << 7); break; case FEC_5_6: tps |= (3 << 7); break; case FEC_7_8: tps |= (4 << 7); break; case FEC_1_2: case FEC_AUTO: break; default: return -EINVAL; } switch (c->code_rate_LP) { case FEC_2_3: tps |= (1 << 4); break; case FEC_3_4: tps |= (2 << 4); break; case FEC_5_6: tps |= (3 << 4); break; case FEC_7_8: tps |= (4 << 4); break; case FEC_1_2: case FEC_AUTO: break; case FEC_NONE: if (c->hierarchy == HIERARCHY_AUTO || c->hierarchy == HIERARCHY_NONE) break; fallthrough; default: return -EINVAL; } switch (c->modulation) { case QPSK: break; case QAM_AUTO: case QAM_16: tps |= (1 << 13); break; case QAM_64: tps |= (2 << 13); break; default: return -EINVAL; } switch (c->transmission_mode) { case TRANSMISSION_MODE_2K: case TRANSMISSION_MODE_AUTO: break; case TRANSMISSION_MODE_8K: tps |= (1 << 0); break; default: return -EINVAL; } switch (c->guard_interval) { case GUARD_INTERVAL_1_32: case GUARD_INTERVAL_AUTO: break; case GUARD_INTERVAL_1_16: tps |= (1 << 2); break; case GUARD_INTERVAL_1_8: tps |= (2 << 2); break; case GUARD_INTERVAL_1_4: tps |= (3 << 2); break; default: return -EINVAL; } switch (c->hierarchy) { case HIERARCHY_AUTO: case HIERARCHY_NONE: break; case HIERARCHY_1: tps |= (1 << 10); break; case HIERARCHY_2: tps |= (2 << 10); break; case HIERARCHY_4: tps |= (3 << 10); break; default: return -EINVAL; } zl10353_single_write(fe, TPS_GIVEN_1, msb(tps)); zl10353_single_write(fe, TPS_GIVEN_0, lsb(tps)); if (fe->ops.i2c_gate_ctrl) fe->ops.i2c_gate_ctrl(fe, 0); /* * If there is no tuner attached to the secondary I2C bus, we call * set_params to program a potential tuner attached somewhere else. * Otherwise, we update the PLL registers via calc_regs. */ if (state->config.no_tuner) { if (fe->ops.tuner_ops.set_params) { fe->ops.tuner_ops.set_params(fe); if (fe->ops.i2c_gate_ctrl) fe->ops.i2c_gate_ctrl(fe, 0); } } else if (fe->ops.tuner_ops.calc_regs) { fe->ops.tuner_ops.calc_regs(fe, pllbuf + 1, 5); pllbuf[1] <<= 1; zl10353_write(fe, pllbuf, sizeof(pllbuf)); } zl10353_single_write(fe, 0x5F, 0x13); /* If no attached tuner or invalid PLL registers, just start the FSM. */ if (state->config.no_tuner || fe->ops.tuner_ops.calc_regs == NULL) zl10353_single_write(fe, FSM_GO, 0x01); else zl10353_single_write(fe, TUNER_GO, 0x01); return 0; } static int zl10353_get_parameters(struct dvb_frontend *fe, struct dtv_frontend_properties *c) { struct zl10353_state *state = fe->demodulator_priv; int s6, s9; u16 tps; static const u8 tps_fec_to_api[8] = { FEC_1_2, FEC_2_3, FEC_3_4, FEC_5_6, FEC_7_8, FEC_AUTO, FEC_AUTO, FEC_AUTO }; s6 = zl10353_read_register(state, STATUS_6); s9 = zl10353_read_register(state, STATUS_9); if (s6 < 0 || s9 < 0) return -EREMOTEIO; if ((s6 & (1 << 5)) == 0 || (s9 & (1 << 4)) == 0) return -EINVAL; /* no FE or TPS lock */ tps = zl10353_read_register(state, TPS_RECEIVED_1) << 8 | zl10353_read_register(state, TPS_RECEIVED_0); c->code_rate_HP = tps_fec_to_api[(tps >> 7) & 7]; c->code_rate_LP = tps_fec_to_api[(tps >> 4) & 7]; switch ((tps >> 13) & 3) { case 0: c->modulation = QPSK; break; case 1: c->modulation = QAM_16; break; case 2: c->modulation = QAM_64; break; default: c->modulation = QAM_AUTO; break; } c->transmission_mode = (tps & 0x01) ? TRANSMISSION_MODE_8K : TRANSMISSION_MODE_2K; switch ((tps >> 2) & 3) { case 0: c->guard_interval = GUARD_INTERVAL_1_32; break; case 1: c->guard_interval = GUARD_INTERVAL_1_16; break; case 2: c->guard_interval = GUARD_INTERVAL_1_8; break; case 3: c->guard_interval = GUARD_INTERVAL_1_4; break; default: c->guard_interval = GUARD_INTERVAL_AUTO; break; } switch ((tps >> 10) & 7) { case 0: c->hierarchy = HIERARCHY_NONE; break; case 1: c->hierarchy = HIERARCHY_1; break; case 2: c->hierarchy = HIERARCHY_2; break; case 3: c->hierarchy = HIERARCHY_4; break; default: c->hierarchy = HIERARCHY_AUTO; break; } c->frequency = state->frequency; c->bandwidth_hz = state->bandwidth; c->inversion = INVERSION_AUTO; return 0; } static int zl10353_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct zl10353_state *state = fe->demodulator_priv; int s6, s7, s8; if ((s6 = zl10353_read_register(state, STATUS_6)) < 0) return -EREMOTEIO; if ((s7 = zl10353_read_register(state, STATUS_7)) < 0) return -EREMOTEIO; if ((s8 = zl10353_read_register(state, STATUS_8)) < 0) return -EREMOTEIO; *status = 0; if (s6 & (1 << 2)) *status |= FE_HAS_CARRIER; if (s6 & (1 << 1)) *status |= FE_HAS_VITERBI; if (s6 & (1 << 5)) *status |= FE_HAS_LOCK; if (s7 & (1 << 4)) *status |= FE_HAS_SYNC; if (s8 & (1 << 6)) *status |= FE_HAS_SIGNAL; if ((*status & (FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC)) != (FE_HAS_CARRIER | FE_HAS_VITERBI | FE_HAS_SYNC)) *status &= ~FE_HAS_LOCK; return 0; } static int zl10353_read_ber(struct dvb_frontend *fe, u32 *ber) { struct zl10353_state *state = fe->demodulator_priv; *ber = zl10353_read_register(state, RS_ERR_CNT_2) << 16 | zl10353_read_register(state, RS_ERR_CNT_1) << 8 | zl10353_read_register(state, RS_ERR_CNT_0); return 0; } static int zl10353_read_signal_strength(struct dvb_frontend *fe, u16 *strength) { struct zl10353_state *state = fe->demodulator_priv; u16 signal = zl10353_read_register(state, AGC_GAIN_1) << 10 | zl10353_read_register(state, AGC_GAIN_0) << 2 | 3; *strength = ~signal; return 0; } static int zl10353_read_snr(struct dvb_frontend *fe, u16 *snr) { struct zl10353_state *state = fe->demodulator_priv; u8 _snr; if (debug_regs) zl10353_dump_regs(fe); _snr = zl10353_read_register(state, SNR); *snr = 10 * _snr / 8; return 0; } static int zl10353_read_ucblocks(struct dvb_frontend *fe, u32 *ucblocks) { struct zl10353_state *state = fe->demodulator_priv; u32 ubl = 0; ubl = zl10353_read_register(state, RS_UBC_1) << 8 | zl10353_read_register(state, RS_UBC_0); state->ucblocks += ubl; *ucblocks = state->ucblocks; return 0; } static int zl10353_get_tune_settings(struct dvb_frontend *fe, struct dvb_frontend_tune_settings *fe_tune_settings) { fe_tune_settings->min_delay_ms = 1000; fe_tune_settings->step_size = 0; fe_tune_settings->max_drift = 0; return 0; } static int zl10353_init(struct dvb_frontend *fe) { struct zl10353_state *state = fe->demodulator_priv; u8 zl10353_reset_attach[6] = { 0x50, 0x03, 0x64, 0x46, 0x15, 0x0F }; if (debug_regs) zl10353_dump_regs(fe); if (state->config.parallel_ts) zl10353_reset_attach[2] &= ~0x20; if (state->config.clock_ctl_1) zl10353_reset_attach[3] = state->config.clock_ctl_1; if (state->config.pll_0) zl10353_reset_attach[4] = state->config.pll_0; /* Do a "hard" reset if not already done */ if (zl10353_read_register(state, 0x50) != zl10353_reset_attach[1] || zl10353_read_register(state, 0x51) != zl10353_reset_attach[2]) { zl10353_write(fe, zl10353_reset_attach, sizeof(zl10353_reset_attach)); if (debug_regs) zl10353_dump_regs(fe); } return 0; } static int zl10353_i2c_gate_ctrl(struct dvb_frontend* fe, int enable) { struct zl10353_state *state = fe->demodulator_priv; u8 val = 0x0a; if (state->config.disable_i2c_gate_ctrl) { /* No tuner attached to the internal I2C bus */ /* If set enable I2C bridge, the main I2C bus stopped hardly */ return 0; } if (enable) val |= 0x10; return zl10353_single_write(fe, 0x62, val); } static void zl10353_release(struct dvb_frontend *fe) { struct zl10353_state *state = fe->demodulator_priv; kfree(state); } static const struct dvb_frontend_ops zl10353_ops; struct dvb_frontend *zl10353_attach(const struct zl10353_config *config, struct i2c_adapter *i2c) { struct zl10353_state *state = NULL; int id; /* allocate memory for the internal state */ state = kzalloc(sizeof(struct zl10353_state), GFP_KERNEL); if (state == NULL) goto error; /* setup the state */ state->i2c = i2c; memcpy(&state->config, config, sizeof(struct zl10353_config)); /* check if the demod is there */ id = zl10353_read_register(state, CHIP_ID); if ((id != ID_ZL10353) && (id != ID_CE6230) && (id != ID_CE6231)) goto error; /* create dvb_frontend */ memcpy(&state->frontend.ops, &zl10353_ops, sizeof(struct dvb_frontend_ops)); state->frontend.demodulator_priv = state; return &state->frontend; error: kfree(state); return NULL; } static const struct dvb_frontend_ops zl10353_ops = { .delsys = { SYS_DVBT }, .info = { .name = "Zarlink ZL10353 DVB-T", .frequency_min_hz = 174 * MHz, .frequency_max_hz = 862 * MHz, .frequency_stepsize_hz = 166667, .caps = FE_CAN_FEC_1_2 | FE_CAN_FEC_2_3 | FE_CAN_FEC_3_4 | FE_CAN_FEC_5_6 | FE_CAN_FEC_7_8 | FE_CAN_FEC_AUTO | FE_CAN_QPSK | FE_CAN_QAM_16 | FE_CAN_QAM_64 | FE_CAN_QAM_AUTO | FE_CAN_TRANSMISSION_MODE_AUTO | FE_CAN_GUARD_INTERVAL_AUTO | FE_CAN_HIERARCHY_AUTO | FE_CAN_RECOVER | FE_CAN_MUTE_TS }, .release = zl10353_release, .init = zl10353_init, .sleep = zl10353_sleep, .i2c_gate_ctrl = zl10353_i2c_gate_ctrl, .write = zl10353_write, .set_frontend = zl10353_set_parameters, .get_frontend = zl10353_get_parameters, .get_tune_settings = zl10353_get_tune_settings, .read_status = zl10353_read_status, .read_ber = zl10353_read_ber, .read_signal_strength = zl10353_read_signal_strength, .read_snr = zl10353_read_snr, .read_ucblocks = zl10353_read_ucblocks, }; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off frontend debugging (default:off)."); module_param(debug_regs, int, 0644); MODULE_PARM_DESC(debug_regs, "Turn on/off frontend register dumps (default:off)."); MODULE_DESCRIPTION("Zarlink ZL10353 DVB-T demodulator driver"); MODULE_AUTHOR("Chris Pascoe"); MODULE_LICENSE("GPL"); EXPORT_SYMBOL_GPL(zl10353_attach); |
742 743 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | // SPDX-License-Identifier: GPL-2.0-or-later /* Paravirtualization interfaces Copyright (C) 2006 Rusty Russell IBM Corporation 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ #include <linux/errno.h> #include <linux/init.h> #include <linux/export.h> #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> #include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> #include <asm/delay.h> #include <asm/fixmap.h> #include <asm/apic.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/special_insns.h> #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } #ifdef CONFIG_PARAVIRT_XXL DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_enable(&virt_spin_lock_key); } static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { return 0; } DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)) { static_call_update(pv_sched_clock, func); } /* These are in entry.S */ static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, .name = "paravirt-ioport", .flags = IORESOURCE_IO | IORESOURCE_BUSY, }; /* * Reserve the whole legacy IO space to prevent any legacy drivers * from wasting time probing for their hardware. This is a fairly * brute-force approach to disabling all non-virtual drivers. * * Note that this must be called very early to have any effect. */ int paravirt_disable_iospace(void) { return request_resource(&ioport_resource, &reserve_ioports); } #ifdef CONFIG_PARAVIRT_XXL static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); } static noinstr unsigned long pv_native_get_debugreg(int regno) { return native_get_debugreg(regno); } static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { native_set_debugreg(regno, val); } noinstr void pv_native_wbinvd(void) { native_wbinvd(); } static noinstr void pv_native_safe_halt(void) { native_safe_halt(); } #endif struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL .extra_user_64bit_cs = __USER_CS, #endif }; /* 64-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { /* Cpu ops. */ .cpu.io_delay = native_io_delay, #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, .cpu.get_debugreg = pv_native_get_debugreg, .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, .cpu.write_msr_safe = native_write_msr_safe, .cpu.read_pmc = native_read_pmc, .cpu.load_tr_desc = native_load_tr_desc, .cpu.set_ldt = native_set_ldt, .cpu.load_gdt = native_load_gdt, .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, .cpu.load_gs_index = native_load_gs_index, .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, .cpu.alloc_ldt = paravirt_nop, .cpu.free_ldt = paravirt_nop, .cpu.load_sp0 = native_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, .mmu.alloc_pte = paravirt_nop, .mmu.alloc_pmd = paravirt_nop, .mmu.alloc_pud = paravirt_nop, .mmu.alloc_p4d = paravirt_nop, .mmu.release_pte = paravirt_nop, .mmu.release_pmd = paravirt_nop, .mmu.release_pud = paravirt_nop, .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, .mmu.set_p4d = native_set_p4d, #if CONFIG_PGTABLE_LEVELS >= 5 .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, .mmu.make_pte = PTE_IDENT, .mmu.make_pgd = PTE_IDENT, .mmu.enter_mmap = paravirt_nop, .mmu.lazy_mode = { .enter = paravirt_nop, .leave = paravirt_nop, .flush = paravirt_nop, }, .mmu.set_fixmap = native_set_fixmap, #endif /* CONFIG_PARAVIRT_XXL */ #if defined(CONFIG_PARAVIRT_SPINLOCKS) /* Lock ops. */ #ifdef CONFIG_SMP .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .lock.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .lock.wait = paravirt_nop, .lock.kick = paravirt_nop, .lock.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ #endif }; #ifdef CONFIG_PARAVIRT_XXL NOKPROBE_SYMBOL(native_load_idt); #endif EXPORT_SYMBOL(pv_ops); EXPORT_SYMBOL_GPL(pv_info); |
5 5 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Framing Layer. * * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/crc-ccitt.h> #include <linux/netdevice.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cffrml, layer) struct cffrml { struct cflayer layer; bool dofcs; /* !< FCS active */ int __percpu *pcpu_refcnt; }; static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt); static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static u32 cffrml_rcv_error; static u32 cffrml_rcv_checsum_error; struct cflayer *cffrml_create(u16 phyid, bool use_fcs) { struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC); if (!this) return NULL; this->pcpu_refcnt = alloc_percpu(int); if (this->pcpu_refcnt == NULL) { kfree(this); return NULL; } caif_assert(offsetof(struct cffrml, layer) == 0); this->layer.receive = cffrml_receive; this->layer.transmit = cffrml_transmit; this->layer.ctrlcmd = cffrml_ctrlcmd; snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid); this->dofcs = use_fcs; this->layer.id = phyid; return (struct cflayer *) this; } void cffrml_free(struct cflayer *layer) { struct cffrml *this = container_obj(layer); free_percpu(this->pcpu_refcnt); kfree(layer); } void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up) { this->up = up; } void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn) { this->dn = dn; } static u16 cffrml_checksum(u16 chks, void *buf, u16 len) { /* FIXME: FCS should be moved to glue in order to use OS-Specific * solutions */ return crc_ccitt(chks, buf, len); } static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) { u16 tmp; u16 len; u16 hdrchks; int pktchks; struct cffrml *this; this = container_obj(layr); cfpkt_extr_head(pkt, &tmp, 2); len = le16_to_cpu(tmp); /* Subtract for FCS on length if FCS is not used. */ if (!this->dofcs) len -= 2; if (cfpkt_setlen(pkt, len) < 0) { ++cffrml_rcv_error; pr_err("Framing length error (%d)\n", len); cfpkt_destroy(pkt); return -EPROTO; } /* * Don't do extract if FCS is false, rather do setlen - then we don't * get a cache-miss. */ if (this->dofcs) { cfpkt_extr_trail(pkt, &tmp, 2); hdrchks = le16_to_cpu(tmp); pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); if (pktchks != hdrchks) { cfpkt_add_trail(pkt, &tmp, 2); ++cffrml_rcv_error; ++cffrml_rcv_checsum_error; pr_info("Frame checksum error (0x%x != 0x%x)\n", hdrchks, pktchks); return -EILSEQ; } } if (cfpkt_erroneous(pkt)) { ++cffrml_rcv_error; pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->up == NULL) { pr_err("Layr up is missing!\n"); cfpkt_destroy(pkt); return -EINVAL; } return layr->up->receive(layr->up, pkt); } static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) { u16 chks; u16 len; __le16 data; struct cffrml *this = container_obj(layr); if (this->dofcs) { chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); data = cpu_to_le16(chks); cfpkt_add_trail(pkt, &data, 2); } else { cfpkt_pad_trail(pkt, 2); } len = cfpkt_getlen(pkt); data = cpu_to_le16(len); cfpkt_add_head(pkt, &data, 2); cfpkt_info(pkt)->hdr_len += 2; if (cfpkt_erroneous(pkt)) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->dn == NULL) { cfpkt_destroy(pkt); return -ENODEV; } return layr->dn->transmit(layr->dn, pkt); } static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } void cffrml_put(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_dec(*this->pcpu_refcnt); } void cffrml_hold(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_inc(*this->pcpu_refcnt); } int cffrml_refcnt_read(struct cflayer *layr) { int i, refcnt = 0; struct cffrml *this = container_obj(layr); for_each_possible_cpu(i) refcnt += *per_cpu_ptr(this->pcpu_refcnt, i); return refcnt; } |
2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 | // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/parser.h> #include <linux/hashtable.h> #include <linux/seq_file.h> /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; /* list for storing orangefs specific superblocks in use */ LIST_HEAD(orangefs_superblocks); DEFINE_SPINLOCK(orangefs_superblocks_lock); enum { Opt_intr, Opt_acl, Opt_local_lock, Opt_err }; static const match_table_t tokens = { { Opt_acl, "acl" }, { Opt_intr, "intr" }, { Opt_local_lock, "local_lock" }, { Opt_err, NULL } }; uint64_t orangefs_features; static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); if (orangefs_sb->flags & ORANGEFS_OPT_LOCAL_LOCK) seq_puts(m, ",local_lock"); return 0; } static int parse_mount_options(struct super_block *sb, char *options, int silent) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); substring_t args[MAX_OPT_ARGS]; char *p; /* * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ sb->s_flags &= ~SB_POSIXACL; orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_acl: sb->s_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; break; case Opt_local_lock: orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; break; default: goto fail; } } return 0; fail: if (!silent) gossip_err("Error: mount option [%s] is not supported.\n", p); return -EINVAL; } static void orangefs_inode_cache_ctor(void *req) { struct orangefs_inode_s *orangefs_inode = req; inode_init_once(&orangefs_inode->vfs_inode); init_rwsem(&orangefs_inode->xattr_sem); } static struct inode *orangefs_alloc_inode(struct super_block *sb) { struct orangefs_inode_s *orangefs_inode; orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL); if (!orangefs_inode) return NULL; /* * We want to clear everything except for rw_semaphore and the * vfs_inode. */ memset(&orangefs_inode->refn.khandle, 0, 16); orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; orangefs_inode->last_failed_block_index_read = 0; memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_alloc_inode: allocated %p\n", &orangefs_inode->vfs_inode); return &orangefs_inode->vfs_inode; } static void orangefs_free_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_cached_xattr *cx; struct hlist_node *tmp; int i; hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) { hlist_del(&cx->node); kfree(cx); } kmem_cache_free(orangefs_inode_cache, orangefs_inode); } static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); gossip_debug(GOSSIP_SUPER_DEBUG, "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); } static int orangefs_write_inode(struct inode *inode, struct writeback_control *wbc) { gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n"); return orangefs_inode_setattr(inode); } /* * NOTE: information filled in here is typically reflected in the * output of the system command 'df' */ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) { int ret = -ENOMEM; struct orangefs_kernel_op_s *new_op = NULL; int flags = 0; struct super_block *sb = NULL; sb = dentry->d_sb; gossip_debug(GOSSIP_SUPER_DEBUG, "%s: called on sb %p (fs_id is %d)\n", __func__, sb, (int)(ORANGEFS_SB(sb)->fs_id)); new_op = op_alloc(ORANGEFS_VFS_OP_STATFS); if (!new_op) return ret; new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id; if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR) flags = ORANGEFS_OP_INTERRUPTIBLE; ret = service_operation(new_op, "orangefs_statfs", flags); if (new_op->downcall.status < 0) goto out_op_release; gossip_debug(GOSSIP_SUPER_DEBUG, "%s: got %ld blocks available | " "%ld blocks total | %ld block size | " "%ld files total | %ld files avail\n", __func__, (long)new_op->downcall.resp.statfs.blocks_avail, (long)new_op->downcall.resp.statfs.blocks_total, (long)new_op->downcall.resp.statfs.block_size, (long)new_op->downcall.resp.statfs.files_total, (long)new_op->downcall.resp.statfs.files_avail); buf->f_type = sb->s_magic; buf->f_fsid.val[0] = ORANGEFS_SB(sb)->fs_id; buf->f_fsid.val[1] = ORANGEFS_SB(sb)->id; buf->f_bsize = new_op->downcall.resp.statfs.block_size; buf->f_namelen = ORANGEFS_NAME_MAX; buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total; buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total; buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail; buf->f_frsize = 0; out_op_release: op_release(new_op); gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret); return ret; } /* * Remount as initiated by VFS layer. We just need to reparse the mount * options, no need to signal pvfs2-client-core about it. */ static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data) { gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n"); return parse_mount_options(sb, data, 1); } /* * Remount as initiated by pvfs2-client-core on restart. This is used to * repopulate mount information left from previous pvfs2-client-core. * * the idea here is that given a valid superblock, we're * re-initializing the user space client with the initial mount * information specified when the super block was first initialized. * this is very different than the first initialization/creation of a * superblock. we use the special service_priority_operation to make * sure that the mount gets ahead of any other pending operation that * is waiting for servicing. this means that the pvfs2-client won't * fail to start several times for all other pending operations before * the client regains all of the mount information from us. * NOTE: this function assumes that the request_mutex is already acquired! */ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) { struct orangefs_kernel_op_s *new_op; int ret = -EINVAL; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n"); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) return -ENOMEM; strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, orangefs_sb->devname); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Remount via host %s\n", new_op->upcall.req.fs_mount.orangefs_config_server); /* * we assume that the calling function has already acquired the * request_mutex to prevent other operations from bypassing * this one */ ret = service_operation(new_op, "orangefs_remount", ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: mount got return value of %d\n", ret); if (ret == 0) { /* * store the id assigned to this sb -- it's just a * short-lived mapping that the system interface uses * to map this superblock to a particular mount entry */ orangefs_sb->id = new_op->downcall.resp.fs_mount.id; orangefs_sb->mount_pending = 0; } op_release(new_op); if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); if (!ret) orangefs_features = new_op->downcall.resp.features.features; else orangefs_features = 0; op_release(new_op); } else { orangefs_features = 0; } return ret; } int fsid_key_table_initialize(void) { return 0; } void fsid_key_table_finalize(void) { } static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, .remount_fs = orangefs_remount_fs, .show_options = orangefs_show_options, }; static struct dentry *orangefs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct orangefs_object_kref refn; if (fh_len < 5 || fh_type > 2) return NULL; ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16); refn.fs_id = (u32) fid->raw[4]; gossip_debug(GOSSIP_SUPER_DEBUG, "fh_to_dentry: handle %pU, fs_id %d\n", &refn.khandle, refn.fs_id); return d_obtain_alias(orangefs_iget(sb, &refn)); } static int orangefs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { int len = parent ? 10 : 5; int type = 1; struct orangefs_object_kref refn; if (*max_len < len) { gossip_err("fh buffer is too small for encoding\n"); *max_len = len; type = 255; goto out; } refn = ORANGEFS_I(inode)->refn; ORANGEFS_khandle_to(&refn.khandle, fh, 16); fh[4] = refn.fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, "Encoding fh: handle %pU, fsid %u\n", &refn.khandle, refn.fs_id); if (parent) { refn = ORANGEFS_I(parent)->refn; ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16); fh[9] = refn.fs_id; type = 2; gossip_debug(GOSSIP_SUPER_DEBUG, "Encoding parent: handle %pU, fsid %u\n", &refn.khandle, refn.fs_id); } *max_len = len; out: return type; } static const struct export_operations orangefs_export_ops = { .encode_fh = orangefs_encode_fh, .fh_to_dentry = orangefs_fh_to_dentry, }; static int orangefs_unmount(int id, __s32 fs_id, const char *devname) { struct orangefs_kernel_op_s *op; int r; op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); if (!op) return -ENOMEM; op->upcall.req.fs_umount.id = id; op->upcall.req.fs_umount.fs_id = fs_id; strscpy(op->upcall.req.fs_umount.orangefs_config_server, devname); r = service_operation(op, "orangefs_fs_umount", 0); /* Not much to do about an error here. */ if (r) gossip_err("orangefs_unmount: service_operation %d\n", r); op_release(op); return r; } static int orangefs_fill_sb(struct super_block *sb, struct orangefs_fs_mount_response *fs_mount, void *data, int silent) { int ret; struct inode *root; struct dentry *root_dentry; struct orangefs_object_kref root_object; ORANGEFS_SB(sb)->sb = sb; ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle; ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; ORANGEFS_SB(sb)->id = fs_mount->id; if (data) { ret = parse_mount_options(sb, data, silent); if (ret) return ret; } /* Hang the xattr handlers off the superblock */ sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; sb->s_op = &orangefs_s_ops; sb->s_d_op = &orangefs_dentry_operations; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; ret = super_setup_bdi(sb); if (ret) return ret; root_object.khandle = ORANGEFS_SB(sb)->root_khandle; root_object.fs_id = ORANGEFS_SB(sb)->fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, "get inode %pU, fsid %d\n", &root_object.khandle, root_object.fs_id); root = orangefs_iget(sb, &root_object); if (IS_ERR(root)) return PTR_ERR(root); gossip_debug(GOSSIP_SUPER_DEBUG, "Allocated root inode [%p] with mode %x\n", root, root->i_mode); /* allocates and places root dentry in dcache */ root_dentry = d_make_root(root); if (!root_dentry) return -ENOMEM; sb->s_export_op = &orangefs_export_ops; sb->s_root = root_dentry; return 0; } struct dentry *orangefs_mount(struct file_system_type *fst, int flags, const char *devname, void *data) { int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; struct dentry *d = ERR_PTR(-EINVAL); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: called with devname %s\n", devname); if (!devname) { gossip_err("ERROR: device name not specified.\n"); return ERR_PTR(-EINVAL); } new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) return ERR_PTR(-ENOMEM); strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Mount via host %s\n", new_op->upcall.req.fs_mount.orangefs_config_server); ret = service_operation(new_op, "orangefs_mount", 0); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: mount got return value of %d\n", ret); if (ret) goto free_op; if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) { gossip_err("ERROR: Retrieved null fs_id\n"); ret = -EINVAL; goto free_op; } sb = sget(fst, NULL, set_anon_super, flags, NULL); if (IS_ERR(sb)) { d = ERR_CAST(sb); orangefs_unmount(new_op->downcall.resp.fs_mount.id, new_op->downcall.resp.fs_mount.fs_id, devname); goto free_op; } /* alloc and init our private orangefs sb info */ sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); if (!ORANGEFS_SB(sb)) { d = ERR_PTR(-ENOMEM); goto free_op; } ret = orangefs_fill_sb(sb, &new_op->downcall.resp.fs_mount, data, flags & SB_SILENT ? 1 : 0); if (ret) { d = ERR_PTR(ret); goto free_sb_and_op; } /* * on successful mount, store the devname and data * used */ strscpy(ORANGEFS_SB(sb)->devname, devname); /* mount_pending must be cleared */ ORANGEFS_SB(sb)->mount_pending = 0; /* * finally, add this sb to our list of known orangefs * sb's */ gossip_debug(GOSSIP_SUPER_DEBUG, "Adding SB %p to orangefs superblocks\n", ORANGEFS_SB(sb)); spin_lock(&orangefs_superblocks_lock); list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks); spin_unlock(&orangefs_superblocks_lock); op_release(new_op); /* Must be removed from the list now. */ ORANGEFS_SB(sb)->no_list = 0; if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) return ERR_PTR(-ENOMEM); new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", 0); orangefs_features = new_op->downcall.resp.features.features; op_release(new_op); } else { orangefs_features = 0; } return dget(sb->s_root); free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ ORANGEFS_SB(sb)->no_list = 1; /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); if (ret == -EINVAL) { gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n"); gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n"); } op_release(new_op); return d; } void orangefs_kill_sb(struct super_block *sb) { int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ kill_anon_super(sb); if (!ORANGEFS_SB(sb)) { mutex_lock(&orangefs_request_mutex); mutex_unlock(&orangefs_request_mutex); return; } /* * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, ORANGEFS_SB(sb)->devname); if (!r) ORANGEFS_SB(sb)->mount_pending = 1; if (!ORANGEFS_SB(sb)->no_list) { /* remove the sb from our list of orangefs specific sb's */ spin_lock(&orangefs_superblocks_lock); /* not list_del_init */ __list_del_entry(&ORANGEFS_SB(sb)->list); ORANGEFS_SB(sb)->list.prev = NULL; spin_unlock(&orangefs_superblocks_lock); } /* * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us * gets completed before we free the dang thing. */ mutex_lock(&orangefs_request_mutex); mutex_unlock(&orangefs_request_mutex); /* free the orangefs superblock private data */ kfree(ORANGEFS_SB(sb)); } int orangefs_inode_cache_initialize(void) { orangefs_inode_cache = kmem_cache_create_usercopy( "orangefs_inode_cache", sizeof(struct orangefs_inode_s), 0, 0, offsetof(struct orangefs_inode_s, link_target), sizeof_field(struct orangefs_inode_s, link_target), orangefs_inode_cache_ctor); if (!orangefs_inode_cache) { gossip_err("Cannot create orangefs_inode_cache\n"); return -ENOMEM; } return 0; } int orangefs_inode_cache_finalize(void) { kmem_cache_destroy(orangefs_inode_cache); return 0; } |
3 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | // SPDX-License-Identifier: GPL-2.0+ /* * Safe Encapsulated USB Serial Driver * * Copyright (C) 2010 Johan Hovold <jhovold@gmail.com> * Copyright (C) 2001 Lineo * Copyright (C) 2001 Hewlett-Packard * * By: * Stuart Lynne <sl@lineo.com>, Tom Rushworth <tbr@lineo.com> */ /* * The encapsultaion is designed to overcome difficulties with some USB * hardware. * * While the USB protocol has a CRC over the data while in transit, i.e. while * being carried over the bus, there is no end to end protection. If the * hardware has any problems getting the data into or out of the USB transmit * and receive FIFO's then data can be lost. * * This protocol adds a two byte trailer to each USB packet to specify the * number of bytes of valid data and a 10 bit CRC that will allow the receiver * to verify that the entire USB packet was received without error. * * Because in this case the sender and receiver are the class and function * drivers there is now end to end protection. * * There is an additional option that can be used to force all transmitted * packets to be padded to the maximum packet size. This provides a work * around for some devices which have problems with small USB packets. * * Assuming a packetsize of N: * * 0..N-2 data and optional padding * * N-2 bits 7-2 - number of bytes of valid data * bits 1-0 top two bits of 10 bit CRC * N-1 bottom 8 bits of 10 bit CRC * * * | Data Length | 10 bit CRC | * + 7 . 6 . 5 . 4 . 3 . 2 . 1 . 0 | 7 . 6 . 5 . 4 . 3 . 2 . 1 . 0 + * * The 10 bit CRC is computed across the sent data, followed by the trailer * with the length set and the CRC set to zero. The CRC is then OR'd into * the trailer. * * When received a 10 bit CRC is computed over the entire frame including * the trailer and should be equal to zero. * * Two module parameters are used to control the encapsulation, if both are * turned of the module works as a simple serial device with NO * encapsulation. * * See linux/drivers/usbd/serial_fd for a device function driver * implementation of this. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> static bool safe = true; static bool padded = IS_ENABLED(CONFIG_USB_SERIAL_SAFE_PADDED); #define DRIVER_AUTHOR "sl@lineo.com, tbr@lineo.com, Johan Hovold <jhovold@gmail.com>" #define DRIVER_DESC "USB Safe Encapsulated Serial" MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); module_param(safe, bool, 0); MODULE_PARM_DESC(safe, "Turn Safe Encapsulation On/Off"); module_param(padded, bool, 0); MODULE_PARM_DESC(padded, "Pad to full wMaxPacketSize On/Off"); #define CDC_DEVICE_CLASS 0x02 #define CDC_INTERFACE_CLASS 0x02 #define CDC_INTERFACE_SUBCLASS 0x06 #define LINEO_INTERFACE_CLASS 0xff #define LINEO_INTERFACE_SUBCLASS_SAFENET 0x01 #define LINEO_SAFENET_CRC 0x01 #define LINEO_SAFENET_CRC_PADDED 0x02 #define LINEO_INTERFACE_SUBCLASS_SAFESERIAL 0x02 #define LINEO_SAFESERIAL_CRC 0x01 #define LINEO_SAFESERIAL_CRC_PADDED 0x02 #define MY_USB_DEVICE(vend, prod, dc, ic, isc) \ .match_flags = USB_DEVICE_ID_MATCH_DEVICE | \ USB_DEVICE_ID_MATCH_DEV_CLASS | \ USB_DEVICE_ID_MATCH_INT_CLASS | \ USB_DEVICE_ID_MATCH_INT_SUBCLASS, \ .idVendor = (vend), \ .idProduct = (prod),\ .bDeviceClass = (dc),\ .bInterfaceClass = (ic), \ .bInterfaceSubClass = (isc), static const struct usb_device_id id_table[] = { {MY_USB_DEVICE(0x49f, 0xffff, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Itsy */ {MY_USB_DEVICE(0x3f0, 0x2101, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Calypso */ {MY_USB_DEVICE(0x4dd, 0x8001, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Iris */ {MY_USB_DEVICE(0x4dd, 0x8002, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Collie */ {MY_USB_DEVICE(0x4dd, 0x8003, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Collie */ {MY_USB_DEVICE(0x4dd, 0x8004, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Collie */ {MY_USB_DEVICE(0x5f9, 0xffff, CDC_DEVICE_CLASS, LINEO_INTERFACE_CLASS, LINEO_INTERFACE_SUBCLASS_SAFESERIAL)}, /* Sharp tmp */ {} /* terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table); static const __u16 crc10_table[256] = { 0x000, 0x233, 0x255, 0x066, 0x299, 0x0aa, 0x0cc, 0x2ff, 0x301, 0x132, 0x154, 0x367, 0x198, 0x3ab, 0x3cd, 0x1fe, 0x031, 0x202, 0x264, 0x057, 0x2a8, 0x09b, 0x0fd, 0x2ce, 0x330, 0x103, 0x165, 0x356, 0x1a9, 0x39a, 0x3fc, 0x1cf, 0x062, 0x251, 0x237, 0x004, 0x2fb, 0x0c8, 0x0ae, 0x29d, 0x363, 0x150, 0x136, 0x305, 0x1fa, 0x3c9, 0x3af, 0x19c, 0x053, 0x260, 0x206, 0x035, 0x2ca, 0x0f9, 0x09f, 0x2ac, 0x352, 0x161, 0x107, 0x334, 0x1cb, 0x3f8, 0x39e, 0x1ad, 0x0c4, 0x2f7, 0x291, 0x0a2, 0x25d, 0x06e, 0x008, 0x23b, 0x3c5, 0x1f6, 0x190, 0x3a3, 0x15c, 0x36f, 0x309, 0x13a, 0x0f5, 0x2c6, 0x2a0, 0x093, 0x26c, 0x05f, 0x039, 0x20a, 0x3f4, 0x1c7, 0x1a1, 0x392, 0x16d, 0x35e, 0x338, 0x10b, 0x0a6, 0x295, 0x2f3, 0x0c0, 0x23f, 0x00c, 0x06a, 0x259, 0x3a7, 0x194, 0x1f2, 0x3c1, 0x13e, 0x30d, 0x36b, 0x158, 0x097, 0x2a4, 0x2c2, 0x0f1, 0x20e, 0x03d, 0x05b, 0x268, 0x396, 0x1a5, 0x1c3, 0x3f0, 0x10f, 0x33c, 0x35a, 0x169, 0x188, 0x3bb, 0x3dd, 0x1ee, 0x311, 0x122, 0x144, 0x377, 0x289, 0x0ba, 0x0dc, 0x2ef, 0x010, 0x223, 0x245, 0x076, 0x1b9, 0x38a, 0x3ec, 0x1df, 0x320, 0x113, 0x175, 0x346, 0x2b8, 0x08b, 0x0ed, 0x2de, 0x021, 0x212, 0x274, 0x047, 0x1ea, 0x3d9, 0x3bf, 0x18c, 0x373, 0x140, 0x126, 0x315, 0x2eb, 0x0d8, 0x0be, 0x28d, 0x072, 0x241, 0x227, 0x014, 0x1db, 0x3e8, 0x38e, 0x1bd, 0x342, 0x171, 0x117, 0x324, 0x2da, 0x0e9, 0x08f, 0x2bc, 0x043, 0x270, 0x216, 0x025, 0x14c, 0x37f, 0x319, 0x12a, 0x3d5, 0x1e6, 0x180, 0x3b3, 0x24d, 0x07e, 0x018, 0x22b, 0x0d4, 0x2e7, 0x281, 0x0b2, 0x17d, 0x34e, 0x328, 0x11b, 0x3e4, 0x1d7, 0x1b1, 0x382, 0x27c, 0x04f, 0x029, 0x21a, 0x0e5, 0x2d6, 0x2b0, 0x083, 0x12e, 0x31d, 0x37b, 0x148, 0x3b7, 0x184, 0x1e2, 0x3d1, 0x22f, 0x01c, 0x07a, 0x249, 0x0b6, 0x285, 0x2e3, 0x0d0, 0x11f, 0x32c, 0x34a, 0x179, 0x386, 0x1b5, 0x1d3, 0x3e0, 0x21e, 0x02d, 0x04b, 0x278, 0x087, 0x2b4, 0x2d2, 0x0e1, }; #define CRC10_INITFCS 0x000 /* Initial FCS value */ #define CRC10_GOODFCS 0x000 /* Good final FCS value */ #define CRC10_FCS(fcs, c) ((((fcs) << 8) & 0x3ff) ^ crc10_table[((fcs) >> 2) & 0xff] ^ (c)) /** * fcs_compute10 - memcpy and calculate 10 bit CRC across buffer * @sp: pointer to buffer * @len: number of bytes * @fcs: starting FCS * * Perform a memcpy and calculate fcs using ppp 10bit CRC algorithm. Return * new 10 bit FCS. */ static inline __u16 fcs_compute10(unsigned char *sp, int len, __u16 fcs) { for (; len-- > 0; fcs = CRC10_FCS(fcs, *sp++)); return fcs; } static void safe_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; unsigned char length = urb->actual_length; int actual_length; __u16 fcs; if (!length) return; if (!safe) goto out; if (length < 2) { dev_err(&port->dev, "malformed packet\n"); return; } fcs = fcs_compute10(data, length, CRC10_INITFCS); if (fcs) { dev_err(&port->dev, "%s - bad CRC %x\n", __func__, fcs); return; } actual_length = data[length - 2] >> 2; if (actual_length > (length - 2)) { dev_err(&port->dev, "%s - inconsistent lengths %d:%d\n", __func__, actual_length, length); return; } dev_info(&urb->dev->dev, "%s - actual: %d\n", __func__, actual_length); length = actual_length; out: tty_insert_flip_string(&port->port, data, length); tty_flip_buffer_push(&port->port); } static int safe_prepare_write_buffer(struct usb_serial_port *port, void *dest, size_t size) { unsigned char *buf = dest; int count; int trailer_len; int pkt_len; __u16 fcs; trailer_len = safe ? 2 : 0; count = kfifo_out_locked(&port->write_fifo, buf, size - trailer_len, &port->lock); if (!safe) return count; /* pad if necessary */ if (padded) { pkt_len = size; memset(buf + count, '0', pkt_len - count - trailer_len); } else { pkt_len = count + trailer_len; } /* set count */ buf[pkt_len - 2] = count << 2; buf[pkt_len - 1] = 0; /* compute fcs and insert into trailer */ fcs = fcs_compute10(buf, pkt_len, CRC10_INITFCS); buf[pkt_len - 2] |= fcs >> 8; buf[pkt_len - 1] |= fcs & 0xff; return pkt_len; } static int safe_startup(struct usb_serial *serial) { struct usb_interface_descriptor *desc; if (serial->dev->descriptor.bDeviceClass != CDC_DEVICE_CLASS) return -ENODEV; desc = &serial->interface->cur_altsetting->desc; if (desc->bInterfaceClass != LINEO_INTERFACE_CLASS) return -ENODEV; if (desc->bInterfaceSubClass != LINEO_INTERFACE_SUBCLASS_SAFESERIAL) return -ENODEV; switch (desc->bInterfaceProtocol) { case LINEO_SAFESERIAL_CRC: break; case LINEO_SAFESERIAL_CRC_PADDED: padded = true; break; default: return -EINVAL; } return 0; } static struct usb_serial_driver safe_device = { .driver = { .name = "safe_serial", }, .id_table = id_table, .num_ports = 1, .process_read_urb = safe_process_read_urb, .prepare_write_buffer = safe_prepare_write_buffer, .attach = safe_startup, }; static struct usb_serial_driver * const serial_drivers[] = { &safe_device, NULL }; module_usb_serial_driver(serial_drivers, id_table); |
61 60 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-mul_3.c - MPI helper functions * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" #include "longlong.h" mpi_limb_t mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb) { mpi_limb_t cy_limb; mpi_size_t j; mpi_limb_t prod_high, prod_low; mpi_limb_t x; /* The loop counter and index J goes from -SIZE to -1. This way * the loop becomes faster. */ j = -s1_size; res_ptr -= j; s1_ptr -= j; cy_limb = 0; do { umul_ppmm(prod_high, prod_low, s1_ptr[j], s2_limb); prod_low += cy_limb; cy_limb = (prod_low < cy_limb ? 1 : 0) + prod_high; x = res_ptr[j]; prod_low = x - prod_low; cy_limb += prod_low > x ? 1 : 0; res_ptr[j] = prod_low; } while (++j); return cy_limb; } |
221 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue DMA device. */ struct virtqueue *vring_create_virtqueue_dma(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, struct device *dma_dev); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */ |
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HW_BREAKPOINT_H #define _LINUX_HW_BREAKPOINT_H #include <linux/perf_event.h> #include <uapi/linux/hw_breakpoint.h> #ifdef CONFIG_HAVE_HW_BREAKPOINT enum bp_type_idx { TYPE_INST = 0, #if defined(CONFIG_HAVE_MIXED_BREAKPOINTS_REGS) TYPE_DATA = 0, #else TYPE_DATA = 1, #endif TYPE_MAX }; extern int __init init_hw_breakpoint(void); static inline void hw_breakpoint_init(struct perf_event_attr *attr) { memset(attr, 0, sizeof(*attr)); attr->type = PERF_TYPE_BREAKPOINT; attr->size = sizeof(*attr); /* * As it's for in-kernel or ptrace use, we want it to be pinned * and to call its callback every hits. */ attr->pinned = 1; attr->sample_period = 1; } static inline void ptrace_breakpoint_init(struct perf_event_attr *attr) { hw_breakpoint_init(attr); attr->exclude_kernel = 1; } static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) { return bp->attr.bp_addr; } static inline int hw_breakpoint_type(struct perf_event *bp) { return bp->attr.bp_type; } static inline unsigned long hw_breakpoint_len(struct perf_event *bp) { return bp->attr.bp_len; } extern struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ extern int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); extern int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check); /* * Kernel breakpoints are not associated with any particular thread. */ extern struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, int cpu); extern struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) { return &bp->hw.info; } #else /* !CONFIG_HAVE_HW_BREAKPOINT */ static inline int __init init_hw_breakpoint(void) { return 0; } static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk) { return NULL; } static inline int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { return -ENOSYS; } static inline int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { return -ENOSYS; } static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, int cpu) { return NULL; } static inline struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { return NULL; } static inline int register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } static inline bool hw_breakpoint_is_used(void) { return false; } static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { } static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) { return NULL; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ #endif /* _LINUX_HW_BREAKPOINT_H */ |
217 206 206 13 161 161 162 160 1 1 305 302 1 1 6 6 6 4 4 4 93 67 48 90 3 1 1 1 78 79 79 46 46 46 13 13 13 13 2 2 2 3 3 3 186 9 3 6 5 2 3 3 3 4 2 3 92 93 93 12 1 1 1 179 181 12 202 203 176 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * These functions manipulate an sctp event. The struct ulpevent is used * to carry notifications and data to the ULP (sockets). * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Ardelle Fan <ardelle.fan@intel.com> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/skbuff.h> #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ static void sctp_ulpevent_init(struct sctp_ulpevent *event, __u16 msg_flags, unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); event->msg_flags = msg_flags; event->rmem_len = len; } /* Create a new sctp_ulpevent. */ static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff *skb; skb = alloc_skb(size, gfp); if (!skb) goto fail; event = sctp_skb2event(skb); sctp_ulpevent_init(event, msg_flags, skb->truesize); return event; fail: return NULL; } /* Is this a MSG_NOTIFICATION? */ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) { return MSG_NOTIFICATION == (event->msg_flags & MSG_NOTIFICATION); } /* Hold the association in case the msg_name needs read out of * the association. */ static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, const struct sctp_association *asoc) { struct sctp_chunk *chunk = event->chunk; struct sk_buff *skb; /* Cast away the const, as we are just wanting to * bump the reference count. */ sctp_association_hold((struct sctp_association *)asoc); skb = sctp_event2skb(event); event->asoc = (struct sctp_association *)asoc; atomic_add(event->rmem_len, &event->asoc->rmem_alloc); sctp_skb_set_owner_r(skb, asoc->base.sk); if (chunk && chunk->head_skb && !chunk->head_skb->sk) chunk->head_skb->sk = asoc->base.sk; } /* A simple destructor to give up the reference to the association. */ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) { struct sctp_association *asoc = event->asoc; atomic_sub(event->rmem_len, &asoc->rmem_alloc); sctp_association_put(asoc); } /* Create and initialize an SCTP_ASSOC_CHANGE event. * * 5.3.1.1 SCTP_ASSOC_CHANGE * * Communication notifications inform the ULP that an SCTP association * has either begun or ended. The identifier for a new association is * provided by this notification. * * Note: There is no field checking here. If a field is unused it will be * zero'd out. */ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( const struct sctp_association *asoc, __u16 flags, __u16 state, __u16 error, __u16 outbound, __u16 inbound, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_assoc_change *sac; struct sk_buff *skb; /* If the lower layer passed in the chunk, it will be * an ABORT, so we need to include it in the sac_info. */ if (chunk) { /* Copy the chunk data to a new skb and reserve enough * head room to use as notification. */ skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_assoc_change), 0, gfp); if (!skb) goto fail; /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); /* Include the notification structure */ sac = skb_push(skb, sizeof(struct sctp_assoc_change)); /* Trim the buffer to the right length. */ skb_trim(skb, sizeof(struct sctp_assoc_change) + ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_chunkhdr)); } else { event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); sac = skb_put(skb, sizeof(struct sctp_assoc_change)); } /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_type: * It should be SCTP_ASSOC_CHANGE. */ sac->sac_type = SCTP_ASSOC_CHANGE; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_state: 32 bits (signed integer) * This field holds one of a number of values that communicate the * event that happened to the association. */ sac->sac_state = state; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_flags: 16 bits (unsigned integer) * Currently unused. */ sac->sac_flags = 0; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_length: sizeof (__u32) * This field is the total length of the notification data, including * the notification header. */ sac->sac_length = skb->len; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_error: 32 bits (signed integer) * * If the state was reached due to a error condition (e.g. * COMMUNICATION_LOST) any relevant error information is available in * this field. This corresponds to the protocol error codes defined in * [SCTP]. */ sac->sac_error = error; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_outbound_streams: 16 bits (unsigned integer) * sac_inbound_streams: 16 bits (unsigned integer) * * The maximum number of streams allowed in each direction are * available in sac_outbound_streams and sac_inbound streams. */ sac->sac_outbound_streams = outbound; sac->sac_inbound_streams = inbound; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * sac_assoc_id: sizeof (sctp_assoc_t) * * The association id field, holds the identifier for the association. * All notifications for a given association have the same association * identifier. For TCP style socket, this field is ignored. */ sctp_ulpevent_set_owner(event, asoc); sac->sac_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* Create and initialize an SCTP_PEER_ADDR_CHANGE event. * * Socket Extensions for SCTP - draft-01 * 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * When a destination address on a multi-homed peer encounters a change * an interface details event is sent. */ static struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( const struct sctp_association *asoc, const struct sockaddr_storage *aaddr, int flags, int state, int error, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_paddr_change *spc; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_paddr_change), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); spc = skb_put(skb, sizeof(struct sctp_paddr_change)); /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_type: * * It should be SCTP_PEER_ADDR_CHANGE. */ spc->spc_type = SCTP_PEER_ADDR_CHANGE; /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_length: sizeof (__u32) * * This field is the total length of the notification data, including * the notification header. */ spc->spc_length = sizeof(struct sctp_paddr_change); /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_flags: 16 bits (unsigned integer) * Currently unused. */ spc->spc_flags = 0; /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_state: 32 bits (signed integer) * * This field holds one of a number of values that communicate the * event that happened to the address. */ spc->spc_state = state; /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_error: 32 bits (signed integer) * * If the state was reached due to any error condition (e.g. * ADDRESS_UNREACHABLE) any relevant error information is available in * this field. */ spc->spc_error = error; /* Socket Extensions for SCTP * 5.3.1.1 SCTP_ASSOC_CHANGE * * spc_assoc_id: sizeof (sctp_assoc_t) * * The association id field, holds the identifier for the association. * All notifications for a given association have the same association * identifier. For TCP style socket, this field is ignored. */ sctp_ulpevent_set_owner(event, asoc); spc->spc_assoc_id = sctp_assoc2id(asoc); /* Sockets API Extensions for SCTP * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE * * spc_aaddr: sizeof (struct sockaddr_storage) * * The affected address field, holds the remote peer's address that is * encountering the change of state. */ memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage)); /* Map ipv4 address into v4-mapped-on-v6 address. */ sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_to_user( sctp_sk(asoc->base.sk), (union sctp_addr *)&spc->spc_aaddr); return event; fail: return NULL; } void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error) { struct sctp_association *asoc = transport->asoc; struct sockaddr_storage addr; struct sctp_ulpevent *event; if (asoc->state < SCTP_STATE_ESTABLISHED) return; memset(&addr, 0, sizeof(struct sockaddr_storage)); memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len); event = sctp_ulpevent_make_peer_addr_change(asoc, &addr, 0, state, error, GFP_ATOMIC); if (event) asoc->stream.si->enqueue_event(&asoc->ulpq, event); } /* Create and initialize an SCTP_REMOTE_ERROR notification. * * Note: This assumes that the chunk->skb->data already points to the * operation error payload. * * Socket Extensions for SCTP - draft-01 * 5.3.1.3 SCTP_REMOTE_ERROR * * A remote peer may send an Operational Error message to its peer. * This message indicates a variety of error conditions on an * association. The entire error TLV as it appears on the wire is * included in a SCTP_REMOTE_ERROR event. Please refer to the SCTP * specification [SCTP] and any extensions for a list of possible * error formats. */ struct sctp_ulpevent * sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, gfp_t gfp) { struct sctp_remote_error *sre; struct sctp_ulpevent *event; struct sctp_errhdr *ch; struct sk_buff *skb; __be16 cause; int elen; ch = (struct sctp_errhdr *)(chunk->skb->data); cause = ch->cause; elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(*ch); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(*ch)); /* Copy the skb to a new skb with room for us to prepend * notification with. */ skb = skb_copy_expand(chunk->skb, sizeof(*sre), 0, gfp); /* Pull off the rest of the cause TLV from the chunk. */ skb_pull(chunk->skb, elen); if (!skb) goto fail; /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); sre = skb_push(skb, sizeof(*sre)); /* Trim the buffer to the right length. */ skb_trim(skb, sizeof(*sre) + elen); /* RFC6458, Section 6.1.3. SCTP_REMOTE_ERROR */ memset(sre, 0, sizeof(*sre)); sre->sre_type = SCTP_REMOTE_ERROR; sre->sre_flags = 0; sre->sre_length = skb->len; sre->sre_error = cause; sctp_ulpevent_set_owner(event, asoc); sre->sre_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* Create and initialize a SCTP_SEND_FAILED notification. * * Socket Extensions for SCTP - draft-01 * 5.3.1.4 SCTP_SEND_FAILED */ struct sctp_ulpevent *sctp_ulpevent_make_send_failed( const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, __u32 error, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_send_failed *ssf; struct sk_buff *skb; /* Pull off any padding. */ int len = ntohs(chunk->chunk_hdr->length); /* Make skb with more room so we can prepend notification. */ skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_send_failed), /* headroom */ 0, /* tailroom */ gfp); if (!skb) goto fail; /* Pull off the common chunk header and DATA header. */ skb_pull(skb, sctp_datachk_len(&asoc->stream)); len -= sctp_datachk_len(&asoc->stream); /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); ssf = skb_push(skb, sizeof(struct sctp_send_failed)); /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_type: * It should be SCTP_SEND_FAILED. */ ssf->ssf_type = SCTP_SEND_FAILED; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_flags: 16 bits (unsigned integer) * The flag value will take one of the following values * * SCTP_DATA_UNSENT - Indicates that the data was never put on * the wire. * * SCTP_DATA_SENT - Indicates that the data was put on the wire. * Note that this does not necessarily mean that the * data was (or was not) successfully delivered. */ ssf->ssf_flags = flags; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_length: sizeof (__u32) * This field is the total length of the notification data, including * the notification header. */ ssf->ssf_length = sizeof(struct sctp_send_failed) + len; skb_trim(skb, ssf->ssf_length); /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_error: 16 bits (unsigned integer) * This value represents the reason why the send failed, and if set, * will be a SCTP protocol error code as defined in [SCTP] section * 3.3.10. */ ssf->ssf_error = error; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_info: sizeof (struct sctp_sndrcvinfo) * The original send information associated with the undelivered * message. */ memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo)); /* Per TSVWG discussion with Randy. Allow the application to * reassemble a fragmented message. */ ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags; /* Socket Extensions for SCTP * 5.3.1.4 SCTP_SEND_FAILED * * ssf_assoc_id: sizeof (sctp_assoc_t) * The association id field, sf_assoc_id, holds the identifier for the * association. All notifications for a given association have the * same association identifier. For TCP style socket, this field is * ignored. */ sctp_ulpevent_set_owner(event, asoc); ssf->ssf_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } struct sctp_ulpevent *sctp_ulpevent_make_send_failed_event( const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, __u32 error, gfp_t gfp) { struct sctp_send_failed_event *ssf; struct sctp_ulpevent *event; struct sk_buff *skb; int len; skb = skb_copy_expand(chunk->skb, sizeof(*ssf), 0, gfp); if (!skb) return NULL; len = ntohs(chunk->chunk_hdr->length); len -= sctp_datachk_len(&asoc->stream); skb_pull(skb, sctp_datachk_len(&asoc->stream)); event = sctp_skb2event(skb); sctp_ulpevent_init(event, MSG_NOTIFICATION, skb->truesize); ssf = skb_push(skb, sizeof(*ssf)); ssf->ssf_type = SCTP_SEND_FAILED_EVENT; ssf->ssf_flags = flags; ssf->ssf_length = sizeof(*ssf) + len; skb_trim(skb, ssf->ssf_length); ssf->ssf_error = error; ssf->ssfe_info.snd_sid = chunk->sinfo.sinfo_stream; ssf->ssfe_info.snd_ppid = chunk->sinfo.sinfo_ppid; ssf->ssfe_info.snd_context = chunk->sinfo.sinfo_context; ssf->ssfe_info.snd_assoc_id = chunk->sinfo.sinfo_assoc_id; ssf->ssfe_info.snd_flags = chunk->chunk_hdr->flags; sctp_ulpevent_set_owner(event, asoc); ssf->ssf_assoc_id = sctp_assoc2id(asoc); return event; } /* Create and initialize a SCTP_SHUTDOWN_EVENT notification. * * Socket Extensions for SCTP - draft-01 * 5.3.1.5 SCTP_SHUTDOWN_EVENT */ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( const struct sctp_association *asoc, __u16 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_shutdown_event *sse; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_shutdown_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); sse = skb_put(skb, sizeof(struct sctp_shutdown_event)); /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_type * It should be SCTP_SHUTDOWN_EVENT */ sse->sse_type = SCTP_SHUTDOWN_EVENT; /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_flags: 16 bits (unsigned integer) * Currently unused. */ sse->sse_flags = 0; /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_length: sizeof (__u32) * This field is the total length of the notification data, including * the notification header. */ sse->sse_length = sizeof(struct sctp_shutdown_event); /* Socket Extensions for SCTP * 5.3.1.5 SCTP_SHUTDOWN_EVENT * * sse_assoc_id: sizeof (sctp_assoc_t) * The association id field, holds the identifier for the association. * All notifications for a given association have the same association * identifier. For TCP style socket, this field is ignored. */ sctp_ulpevent_set_owner(event, asoc); sse->sse_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* Create and initialize a SCTP_ADAPTATION_INDICATION notification. * * Socket Extensions for SCTP * 5.3.1.6 SCTP_ADAPTATION_INDICATION */ struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication( const struct sctp_association *asoc, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_adaptation_event *sai; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_adaptation_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); sai = skb_put(skb, sizeof(struct sctp_adaptation_event)); sai->sai_type = SCTP_ADAPTATION_INDICATION; sai->sai_flags = 0; sai->sai_length = sizeof(struct sctp_adaptation_event); sai->sai_adaptation_ind = asoc->peer.adaptation_ind; sctp_ulpevent_set_owner(event, asoc); sai->sai_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* A message has been received. Package this message as a notification * to pass it to the upper layers. Go ahead and calculate the sndrcvinfo * even if filtered out later. * * Socket Extensions for SCTP * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) */ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event = NULL; struct sk_buff *skb = chunk->skb; struct sock *sk = asoc->base.sk; size_t padding, datalen; int rx_count; /* * check to see if we need to make space for this * new skb, expand the rcvbuffer if needed, or drop * the frame */ if (asoc->ep->rcvbuf_policy) rx_count = atomic_read(&asoc->rmem_alloc); else rx_count = atomic_read(&sk->sk_rmem_alloc); datalen = ntohs(chunk->chunk_hdr->length); if (rx_count >= sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, datalen)) goto fail; /* Clone the original skb, sharing the data. */ skb = skb_clone(chunk->skb, gfp); if (!skb) goto fail; /* Now that all memory allocations for this chunk succeeded, we * can mark it as received so the tsn_map is updated correctly. */ if (sctp_tsnmap_mark(&asoc->peer.tsn_map, ntohl(chunk->subh.data_hdr->tsn), chunk->transport)) goto fail_mark; /* First calculate the padding, so we don't inadvertently * pass up the wrong length to the user. * * RFC 2960 - Section 3.2 Chunk Field Descriptions * * The total length of a chunk(including Type, Length and Value fields) * MUST be a multiple of 4 bytes. If the length of the chunk is not a * multiple of 4 bytes, the sender MUST pad the chunk with all zero * bytes and this padding is not included in the chunk length field. * The sender should never pad with more than 3 bytes. The receiver * MUST ignore the padding bytes. */ padding = SCTP_PAD4(datalen) - datalen; /* Fixup cloned skb with just this chunks data. */ skb_trim(skb, chunk->chunk_end - padding - skb->data); /* Embed the event fields inside the cloned skb. */ event = sctp_skb2event(skb); /* Initialize event with flags 0 and correct length * Since this is a clone of the original skb, only account for * the data of this chunk as other chunks will be accounted separately. */ sctp_ulpevent_init(event, 0, skb->len + sizeof(struct sk_buff)); /* And hold the chunk as we need it for getting the IP headers * later in recvmsg */ sctp_chunk_hold(chunk); event->chunk = chunk; sctp_ulpevent_receive_data(event, asoc); event->stream = ntohs(chunk->subh.data_hdr->stream); if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { event->flags |= SCTP_UNORDERED; event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); } event->tsn = ntohl(chunk->subh.data_hdr->tsn); event->msg_flags |= chunk->chunk_hdr->flags; return event; fail_mark: kfree_skb(skb); fail: return NULL; } /* Create a partial delivery related event. * * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT * * When a receiver is engaged in a partial delivery of a * message this notification will be used to indicate * various events. */ struct sctp_ulpevent *sctp_ulpevent_make_pdapi( const struct sctp_association *asoc, __u32 indication, __u32 sid, __u32 seq, __u32 flags, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_pdapi_event *pd; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_pdapi_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); pd = skb_put(skb, sizeof(struct sctp_pdapi_event)); /* pdapi_type * It should be SCTP_PARTIAL_DELIVERY_EVENT * * pdapi_flags: 16 bits (unsigned integer) * Currently unused. */ pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; pd->pdapi_flags = flags; pd->pdapi_stream = sid; pd->pdapi_seq = seq; /* pdapi_length: 32 bits (unsigned integer) * * This field is the total length of the notification data, including * the notification header. It will generally be sizeof (struct * sctp_pdapi_event). */ pd->pdapi_length = sizeof(struct sctp_pdapi_event); /* pdapi_indication: 32 bits (unsigned integer) * * This field holds the indication being sent to the application. */ pd->pdapi_indication = indication; /* pdapi_assoc_id: sizeof (sctp_assoc_t) * * The association id field, holds the identifier for the association. */ sctp_ulpevent_set_owner(event, asoc); pd->pdapi_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } struct sctp_ulpevent *sctp_ulpevent_make_authkey( const struct sctp_association *asoc, __u16 key_id, __u32 indication, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_authkey_event *ak; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_authkey_event), MSG_NOTIFICATION, gfp); if (!event) goto fail; skb = sctp_event2skb(event); ak = skb_put(skb, sizeof(struct sctp_authkey_event)); ak->auth_type = SCTP_AUTHENTICATION_EVENT; ak->auth_flags = 0; ak->auth_length = sizeof(struct sctp_authkey_event); ak->auth_keynumber = key_id; ak->auth_altkeynumber = 0; ak->auth_indication = indication; /* * The association id field, holds the identifier for the association. */ sctp_ulpevent_set_owner(event, asoc); ak->auth_assoc_id = sctp_assoc2id(asoc); return event; fail: return NULL; } /* * Socket Extensions for SCTP * 6.3.10. SCTP_SENDER_DRY_EVENT */ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( const struct sctp_association *asoc, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_sender_dry_event *sdry; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_sender_dry_event), MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); sdry = skb_put(skb, sizeof(struct sctp_sender_dry_event)); sdry->sender_dry_type = SCTP_SENDER_DRY_EVENT; sdry->sender_dry_flags = 0; sdry->sender_dry_length = sizeof(struct sctp_sender_dry_event); sctp_ulpevent_set_owner(event, asoc); sdry->sender_dry_assoc_id = sctp_assoc2id(asoc); return event; } struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, __u16 stream_num, __be16 *stream_list, gfp_t gfp) { struct sctp_stream_reset_event *sreset; struct sctp_ulpevent *event; struct sk_buff *skb; int length, i; length = sizeof(struct sctp_stream_reset_event) + 2 * stream_num; event = sctp_ulpevent_new(length, MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); sreset = skb_put(skb, length); sreset->strreset_type = SCTP_STREAM_RESET_EVENT; sreset->strreset_flags = flags; sreset->strreset_length = length; sctp_ulpevent_set_owner(event, asoc); sreset->strreset_assoc_id = sctp_assoc2id(asoc); for (i = 0; i < stream_num; i++) sreset->strreset_stream_list[i] = ntohs(stream_list[i]); return event; } struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( const struct sctp_association *asoc, __u16 flags, __u32 local_tsn, __u32 remote_tsn, gfp_t gfp) { struct sctp_assoc_reset_event *areset; struct sctp_ulpevent *event; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_assoc_reset_event), MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); areset = skb_put(skb, sizeof(struct sctp_assoc_reset_event)); areset->assocreset_type = SCTP_ASSOC_RESET_EVENT; areset->assocreset_flags = flags; areset->assocreset_length = sizeof(struct sctp_assoc_reset_event); sctp_ulpevent_set_owner(event, asoc); areset->assocreset_assoc_id = sctp_assoc2id(asoc); areset->assocreset_local_tsn = local_tsn; areset->assocreset_remote_tsn = remote_tsn; return event; } struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( const struct sctp_association *asoc, __u16 flags, __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp) { struct sctp_stream_change_event *schange; struct sctp_ulpevent *event; struct sk_buff *skb; event = sctp_ulpevent_new(sizeof(struct sctp_stream_change_event), MSG_NOTIFICATION, gfp); if (!event) return NULL; skb = sctp_event2skb(event); schange = skb_put(skb, sizeof(struct sctp_stream_change_event)); schange->strchange_type = SCTP_STREAM_CHANGE_EVENT; schange->strchange_flags = flags; schange->strchange_length = sizeof(struct sctp_stream_change_event); sctp_ulpevent_set_owner(event, asoc); schange->strchange_assoc_id = sctp_assoc2id(asoc); schange->strchange_instrms = strchange_instrms; schange->strchange_outstrms = strchange_outstrms; return event; } /* Return the notification type, assuming this is a notification * event. */ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) { union sctp_notification *notification; struct sk_buff *skb; skb = sctp_event2skb(event); notification = (union sctp_notification *) skb->data; return notification->sn_header.sn_type; } /* RFC6458, Section 5.3.2. SCTP Header Information Structure * (SCTP_SNDRCV, DEPRECATED) */ void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { struct sctp_sndrcvinfo sinfo; if (sctp_ulpevent_is_notification(event)) return; memset(&sinfo, 0, sizeof(sinfo)); sinfo.sinfo_stream = event->stream; sinfo.sinfo_ssn = event->ssn; sinfo.sinfo_ppid = event->ppid; sinfo.sinfo_flags = event->flags; sinfo.sinfo_tsn = event->tsn; sinfo.sinfo_cumtsn = event->cumtsn; sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); /* Context value that is set via SCTP_CONTEXT socket option. */ sinfo.sinfo_context = event->asoc->default_rcv_context; /* These fields are not used while receiving. */ sinfo.sinfo_timetolive = 0; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, sizeof(sinfo), &sinfo); } /* RFC6458, Section 5.3.5 SCTP Receive Information Structure * (SCTP_SNDRCV) */ void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr) { struct sctp_rcvinfo rinfo; if (sctp_ulpevent_is_notification(event)) return; memset(&rinfo, 0, sizeof(struct sctp_rcvinfo)); rinfo.rcv_sid = event->stream; rinfo.rcv_ssn = event->ssn; rinfo.rcv_ppid = event->ppid; rinfo.rcv_flags = event->flags; rinfo.rcv_tsn = event->tsn; rinfo.rcv_cumtsn = event->cumtsn; rinfo.rcv_assoc_id = sctp_assoc2id(event->asoc); rinfo.rcv_context = event->asoc->default_rcv_context; put_cmsg(msghdr, IPPROTO_SCTP, SCTP_RCVINFO, sizeof(rinfo), &rinfo); } /* RFC6458, Section 5.3.6. SCTP Next Receive Information Structure * (SCTP_NXTINFO) */ static void __sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr, const struct sk_buff *skb) { struct sctp_nxtinfo nxtinfo; memset(&nxtinfo, 0, sizeof(nxtinfo)); nxtinfo.nxt_sid = event->stream; nxtinfo.nxt_ppid = event->ppid; nxtinfo.nxt_flags = event->flags; if (sctp_ulpevent_is_notification(event)) nxtinfo.nxt_flags |= SCTP_NOTIFICATION; nxtinfo.nxt_length = skb->len; nxtinfo.nxt_assoc_id = sctp_assoc2id(event->asoc); put_cmsg(msghdr, IPPROTO_SCTP, SCTP_NXTINFO, sizeof(nxtinfo), &nxtinfo); } void sctp_ulpevent_read_nxtinfo(const struct sctp_ulpevent *event, struct msghdr *msghdr, struct sock *sk) { struct sk_buff *skb; int err; skb = sctp_skb_recv_datagram(sk, MSG_PEEK | MSG_DONTWAIT, &err); if (skb != NULL) { __sctp_ulpevent_read_nxtinfo(sctp_skb2event(skb), msghdr, skb); /* Just release refcount here. */ kfree_skb(skb); } } /* Do accounting for bytes received and hold a reference to the association * for each skb. */ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc) { struct sk_buff *skb, *frag; skb = sctp_event2skb(event); /* Set the owner and charge rwnd for bytes received. */ sctp_ulpevent_set_owner(event, asoc); sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); if (!skb->data_len) return; /* Note: Not clearing the entire event struct as this is just a * fragment of the real event. However, we still need to do rwnd * accounting. * In general, the skb passed from IP can have only 1 level of * fragments. But we allow multiple levels of fragments. */ skb_walk_frags(skb, frag) sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc); } /* Do accounting for bytes just read by user and release the references to * the association. */ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; unsigned int len; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as * multiple associations may be on a single UDP-style socket. * Use the local private area of the skb to track the owning * association. */ skb = sctp_event2skb(event); len = skb->len; if (!skb->data_len) goto done; /* Don't forget the fragments. */ skb_walk_frags(skb, frag) { /* NOTE: skb_shinfos are recursive. Although IP returns * skb's with only 1 level of fragments, SCTP reassembly can * increase the levels. */ sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); } done: sctp_assoc_rwnd_increase(event->asoc, len); sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; skb = sctp_event2skb(event); if (!skb->data_len) goto done; /* Don't forget the fragments. */ skb_walk_frags(skb, frag) { /* NOTE: skb_shinfos are recursive. Although IP returns * skb's with only 1 level of fragments, SCTP reassembly can * increase the levels. */ sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); } done: sctp_chunk_put(event->chunk); sctp_ulpevent_release_owner(event); } /* Free a ulpevent that has an owner. It includes releasing the reference * to the owner, updating the rwnd in case of a DATA event and freeing the * skb. */ void sctp_ulpevent_free(struct sctp_ulpevent *event) { if (sctp_ulpevent_is_notification(event)) sctp_ulpevent_release_owner(event); else sctp_ulpevent_release_data(event); kfree_skb(sctp_event2skb(event)); } /* Purge the skb lists holding ulpevents. */ unsigned int sctp_queue_purge_ulpevents(struct sk_buff_head *list) { struct sk_buff *skb; unsigned int data_unread = 0; while ((skb = skb_dequeue(list)) != NULL) { struct sctp_ulpevent *event = sctp_skb2event(skb); if (!sctp_ulpevent_is_notification(event)) data_unread += skb->len; sctp_ulpevent_free(event); } return data_unread; } |
57 56 57 57 3 5 21 57 57 57 6 20 35 8 17 56 2 2 2 2 56 56 1 9 9 9 3 3 3 1 1 9 2 2 3 2 6 6 6 3 6 8 8 4 3 2 8 1 2 2 2 2 2 2 4 4 4 4 10 27 14 24 27 27 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 | /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ipv6.h> #include <net/inet6_hashtables.h> #include <net/addrconf.h> #include "rds.h" #include "loop.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) /* converting this to RCU is a chore for another day.. */ static DEFINE_SPINLOCK(rds_conn_lock); static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, const struct in6_addr *faddr) { static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; u32 lhash, fhash, hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); #else fhash = (__force u32)faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } #define rds_conn_info_set(var, test, suffix) do { \ if (test) \ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; break; } } rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, laddr, faddr); return ret; } /* * This is called by transports as they're bringing down a connection. * It clears partial message state so that the transport can start sending * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; rdsdebug("connection %pI6c to %pI6c reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all * of them to the application. That is not consistent with the * reliability guarantees of RDS. */ } static void __rds_conn_path_init(struct rds_connection *conn, struct rds_conn_path *cp, bool is_outgoing) { spin_lock_init(&cp->cp_lock); cp->cp_next_tx_seq = 1; init_waitqueue_head(&cp->cp_waitq); INIT_LIST_HEAD(&cp->cp_send_queue); INIT_LIST_HEAD(&cp->cp_retrans); cp->cp_conn = conn; atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; cp->cp_reconnect_jiffies = 0; cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); mutex_init(&cp->cp_cm_lock); cp->cp_flags = 0; } /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so * span the lifetime of the actual underlying transport connections. * * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && ipv6_addr_equal(laddr, faddr) && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we * can stick the other QP. */ parent = conn; conn = parent->c_passive; } rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp); if (!conn->c_path) { kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-ENOMEM); goto out; } INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = *laddr; conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the * index used for this connection. Otherwise, set it to 0 as * the socket is not bound to an interface. c_bound_if is used * to look up a socket when a packet is received */ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) conn->c_bound_if = dev_if; else #endif conn->c_bound_if = 0; rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } /* * This is where a connection becomes loopback. If *any* RDS sockets * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; if (trans->t_prefer_loopback) { if (likely(is_outgoing)) { /* "outgoing" connection to local address. * Protocol says it wants the connection * handled by the loopback transport. * This is what TCP does. */ trans = &rds_loop_transport; } else { /* No transport currently in use * should end up here, but if it * does, reset/destroy the connection. */ kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-EOPNOTSUPP); goto out; } } } conn->c_trans = trans; init_waitqueue_head(&conn->c_hs_waitq); for (i = 0; i < npaths; i++) { __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; } rcu_read_lock(); if (rds_destroy_pending(conn)) ret = -ENETDOWN; else ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", conn, laddr, faddr, strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could * have created the same conn (either normal or passive) in the * interim. We check while holding the lock. If we won, we complete * init and return our conn. If we lost, we rollback and return the * other one. */ spin_lock_irqsave(&rds_conn_lock, flags); if (parent) { /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { parent->c_passive = conn; rds_cong_add_conn(conn); rds_conn_count++; } } else { /* Creating normal conn */ struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (found) { struct rds_conn_path *cp; int i; for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; /* The ->conn_alloc invocation may have * allocated resource for all paths, so all * of them may have to be freed here. */ if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { conn->c_my_gen_num = rds_gen_num; conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } } spin_unlock_irqrestore(&rds_conn_lock, flags); rcu_read_unlock(); out: return conn; } struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); void rds_conn_shutdown(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; /* shut it down unless it's down already */ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire * duration of the shutdown operation, else we may be * deadlocking with the CM handler. Instead, the CM event * handler is supposed to check for state DISCONNECTING */ mutex_lock(&cp->cp_cm_lock); if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", atomic_read(&cp->cp_state)); mutex_unlock(&cp->cp_cm_lock); return; } mutex_unlock(&cp->cp_cm_lock); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); wait_event(cp->cp_waitq, !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); conn->c_trans->conn_path_shutdown(cp); rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproducible with loopback connections. * Mostly harmless. * * Note that this also happens with rds-tcp because * we could have triggered rds_conn_path_drop in irq * mode from rds_tcp_state change on the receipt of * a FIN, thus we need to recheck for RDS_CONN_ERROR * here. */ rds_conn_path_error(cp, "%s: failed to transition " "to state DOWN, current state " "is %d\n", __func__, atomic_read(&cp->cp_state)); return; } } /* Then reconnect if it's still live. * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } } /* destroy a single rds_conn_path. rds_conn_destroy() iterates over * all paths using rds_conn_path_destroy() */ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); rds_conn_path_drop(cp, true); flush_work(&cp->cp_down_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &cp->cp_send_queue, m_conn_item) { list_del_init(&rm->m_conn_item); BUG_ON(!list_empty(&rm->m_sock_item)); rds_message_put(rm); } if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); WARN_ON(delayed_work_pending(&cp->cp_send_w)); WARN_ON(delayed_work_pending(&cp->cp_recv_w)); WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } /* * Stop and free a connection. * * This can only be used in very limited circumstances. It assumes that once * the conn has been shutdown that no one else is referencing the connection. * We can only ensure this in the rmmod path in the current code. */ void rds_conn_destroy(struct rds_connection *conn) { unsigned long flags; int i; struct rds_conn_path *cp; int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); synchronize_rcu(); /* shut the connection down */ for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_destroy(cp); BUG_ON(!list_empty(&cp->cp_retrans)); } /* * The congestion maps aren't freed up here. They're * freed by rds_cong_exit() after all the connections * have been freed. */ rds_cong_remove_conn(conn); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, void *saddr, void *daddr, int flip, bool isv6) { #if IS_ENABLED(CONFIG_IPV6) if (isv6) rds6_inc_info_copy(inc, iter, saddr, daddr, flip); else #endif rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); } static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; unsigned int total = 0; unsigned long flags; size_t i; int j; if (isv6) len /= sizeof(struct rds6_info_message); else len /= sizeof(struct rds_info_message); rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; int npaths; if (!isv6 && conn->c_isv6) continue; npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); for (j = 0; j < npaths; j++) { cp = &conn->c_path[j]; if (want_send) list = &cp->cp_send_queue; else list = &cp->cp_retrans; spin_lock_irqsave(&cp->cp_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { total++; if (total <= len) __rds_inc_msg_cp(&rm->m_inc, iter, &conn->c_laddr, &conn->c_faddr, 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } } rcu_read_unlock(); lens->nr = total; if (isv6) lens->each = sizeof(struct rds6_info_message); else lens->each = sizeof(struct rds_info_message); } static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } #endif static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 1); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 1); } #endif static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 0); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 0); } #endif void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_conn_path *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; /* XXX We only copy the information from the first * path for now. The problem is that if there are * more than one underlying paths, we cannot report * information of all of them using the existing * API. For example, there is only one next_tx_seq, * which path's next_tx_seq should we report? It is * a bug in the design of MPRDS. */ cp = conn->c_path; /* XXX no cp_lock usage.. */ if (!visitor(cp, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; if (conn->c_isv6) return 0; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; strncpy(cinfo->transport, conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } #if IS_ENABLED(CONFIG_IPV6) static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds6_info_connection *cinfo6 = buffer; struct rds_connection *conn = cp->cp_conn; cinfo6->next_tx_seq = cp->cp_next_tx_seq; cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; strncpy(cinfo6->transport, conn->c_trans->t_name, sizeof(cinfo6->transport)); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); /* Just return 1 as there is no error case. This is a helper function * for rds_walk_conn_path_info() and it wants a return value. */ return 1; } #endif static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, buffer, sizeof(struct rds_info_connection)); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor, buffer, sizeof(struct rds6_info_connection)); } #endif int rds_conn_init(void) { int ret; ret = rds_loop_net_init(); /* register pernet callback */ if (ret) return ret; rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_register_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif return 0; } void rds_conn_exit(void) { rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); kmem_cache_destroy(rds_conn_slab); rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif } /* * Force a disconnect */ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } queue_work(rds_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_drop(&conn->c_path[0], false); } EXPORT_SYMBOL_GPL(rds_conn_drop); /* * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); /* Check connectivity of all paths */ void rds_check_all_paths(struct rds_connection *conn) { int i = 0; do { rds_conn_path_connect_if_down(&conn->c_path[i]); } while (++i < conn->c_npaths); } void rds_conn_connect_if_down(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); void __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintk(fmt, ap); va_end(ap); rds_conn_path_drop(cp, false); } |
129 64 125 8 4 10 70 66 64 65 63 18 7 2 9 5 5 7 2 2 12 23 21 8 8 2 6 253 227 52 22 237 218 10 3 13 11 2 13 9 4 3 3 14 5 9 6 1 14 14 12 2 14 14 14 6 8 5 90 54 34 42 90 90 72 72 4 5 3 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 | // SPDX-License-Identifier: GPL-2.0-or-later /* * USB Network driver infrastructure * Copyright (C) 2000-2005 by David Brownell * Copyright (C) 2003-2005 David Hollis <dhollis@davehollis.com> */ /* * This is a generic "USB networking" framework that works with several * kinds of full and high speed networking devices: host-to-host cables, * smart usb peripherals, and actual Ethernet adapters. * * These devices usually differ in terms of control protocols (if they * even have one!) and sometimes they define new framing to wrap or batch * Ethernet packets. Otherwise, they talk to USB pretty much the same, * so interface (un)binding, endpoint I/O queues, fault handling, and other * issues can usefully be addressed by this framework. */ #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ctype.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/pm_runtime.h> /*-------------------------------------------------------------------------*/ /* * Nineteen USB 1.1 max size bulk transactions per frame (ms), max. * Several dozen bytes of IPv4 data can fit in two such transactions. * One maximum size Ethernet packet takes twenty four of them. * For high speed, each frame comfortably fits almost 36 max size * Ethernet packets (so queues should be bigger). * * The goal is to let the USB host controller be busy for 5msec or * more before an irq is required, under load. Jumbograms change * the equation. */ #define MAX_QUEUE_MEMORY (60 * 1518) #define RX_QLEN(dev) ((dev)->rx_qlen) #define TX_QLEN(dev) ((dev)->tx_qlen) // reawaken network queue this soon after stopping; else watchdog barks #define TX_TIMEOUT_JIFFIES (5*HZ) /* throttle rx/tx briefly after some faults, so hub_wq might disconnect() * us (it polls at HZ/4 usually) before we report too many false errors. */ #define THROTTLE_JIFFIES (HZ/8) // between wakeups #define UNLINK_TIMEOUT_MS 3 /*-------------------------------------------------------------------------*/ /* use ethtool to change the level for any given device */ static int msg_level = -1; module_param (msg_level, int, 0); MODULE_PARM_DESC (msg_level, "Override default message level"); /*-------------------------------------------------------------------------*/ static const char * const usbnet_event_names[] = { [EVENT_TX_HALT] = "EVENT_TX_HALT", [EVENT_RX_HALT] = "EVENT_RX_HALT", [EVENT_RX_MEMORY] = "EVENT_RX_MEMORY", [EVENT_STS_SPLIT] = "EVENT_STS_SPLIT", [EVENT_LINK_RESET] = "EVENT_LINK_RESET", [EVENT_RX_PAUSED] = "EVENT_RX_PAUSED", [EVENT_DEV_ASLEEP] = "EVENT_DEV_ASLEEP", [EVENT_DEV_OPEN] = "EVENT_DEV_OPEN", [EVENT_DEVICE_REPORT_IDLE] = "EVENT_DEVICE_REPORT_IDLE", [EVENT_NO_RUNTIME_PM] = "EVENT_NO_RUNTIME_PM", [EVENT_RX_KILL] = "EVENT_RX_KILL", [EVENT_LINK_CHANGE] = "EVENT_LINK_CHANGE", [EVENT_SET_RX_MODE] = "EVENT_SET_RX_MODE", [EVENT_NO_IP_ALIGN] = "EVENT_NO_IP_ALIGN", }; /* handles CDC Ethernet and many other network "bulk data" interfaces */ int usbnet_get_endpoints(struct usbnet *dev, struct usb_interface *intf) { int tmp; struct usb_host_interface *alt = NULL; struct usb_host_endpoint *in = NULL, *out = NULL; struct usb_host_endpoint *status = NULL; for (tmp = 0; tmp < intf->num_altsetting; tmp++) { unsigned ep; in = out = status = NULL; alt = intf->altsetting + tmp; /* take the first altsetting with in-bulk + out-bulk; * remember any status endpoint, just in case; * ignore other endpoints and altsettings. */ for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) { struct usb_host_endpoint *e; int intr = 0; e = alt->endpoint + ep; /* ignore endpoints which cannot transfer data */ if (!usb_endpoint_maxp(&e->desc)) continue; switch (e->desc.bmAttributes) { case USB_ENDPOINT_XFER_INT: if (!usb_endpoint_dir_in(&e->desc)) continue; intr = 1; fallthrough; case USB_ENDPOINT_XFER_BULK: break; default: continue; } if (usb_endpoint_dir_in(&e->desc)) { if (!intr && !in) in = e; else if (intr && !status) status = e; } else { if (!out) out = e; } } if (in && out) break; } if (!alt || !in || !out) return -EINVAL; if (alt->desc.bAlternateSetting != 0 || !(dev->driver_info->flags & FLAG_NO_SETINT)) { tmp = usb_set_interface (dev->udev, alt->desc.bInterfaceNumber, alt->desc.bAlternateSetting); if (tmp < 0) return tmp; } dev->in = usb_rcvbulkpipe (dev->udev, in->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); dev->out = usb_sndbulkpipe (dev->udev, out->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); dev->status = status; return 0; } EXPORT_SYMBOL_GPL(usbnet_get_endpoints); int usbnet_get_ethernet_addr(struct usbnet *dev, int iMACAddress) { u8 addr[ETH_ALEN]; int tmp = -1, ret; unsigned char buf [13]; ret = usb_string(dev->udev, iMACAddress, buf, sizeof buf); if (ret == 12) tmp = hex2bin(addr, buf, 6); if (tmp < 0) { dev_dbg(&dev->udev->dev, "bad MAC string %d fetch, %d\n", iMACAddress, tmp); if (ret >= 0) ret = -EINVAL; return ret; } eth_hw_addr_set(dev->net, addr); return 0; } EXPORT_SYMBOL_GPL(usbnet_get_ethernet_addr); static void intr_complete (struct urb *urb) { struct usbnet *dev = urb->context; int status = urb->status; switch (status) { /* success */ case 0: dev->driver_info->status(dev, urb); break; /* software-driven interface shutdown */ case -ENOENT: /* urb killed */ case -ESHUTDOWN: /* hardware gone */ netif_dbg(dev, ifdown, dev->net, "intr shutdown, code %d\n", status); return; /* NOTE: not throttling like RX/TX, since this endpoint * already polls infrequently */ default: netdev_dbg(dev->net, "intr status %d\n", status); break; } status = usb_submit_urb (urb, GFP_ATOMIC); if (status != 0) netif_err(dev, timer, dev->net, "intr resubmit --> %d\n", status); } static int init_status (struct usbnet *dev, struct usb_interface *intf) { char *buf = NULL; unsigned pipe = 0; unsigned maxp; unsigned period; if (!dev->driver_info->status) return 0; pipe = usb_rcvintpipe (dev->udev, dev->status->desc.bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); maxp = usb_maxpacket(dev->udev, pipe); /* avoid 1 msec chatter: min 8 msec poll rate */ period = max ((int) dev->status->desc.bInterval, (dev->udev->speed == USB_SPEED_HIGH) ? 7 : 3); buf = kmalloc (maxp, GFP_KERNEL); if (buf) { dev->interrupt = usb_alloc_urb (0, GFP_KERNEL); if (!dev->interrupt) { kfree (buf); return -ENOMEM; } else { usb_fill_int_urb(dev->interrupt, dev->udev, pipe, buf, maxp, intr_complete, dev, period); dev->interrupt->transfer_flags |= URB_FREE_BUFFER; dev_dbg(&intf->dev, "status ep%din, %d bytes period %d\n", usb_pipeendpoint(pipe), maxp, period); } } return 0; } /* Submit the interrupt URB if not previously submitted, increasing refcount */ int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags) { int ret = 0; WARN_ON_ONCE(dev->interrupt == NULL); if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); if (++dev->interrupt_count == 1) ret = usb_submit_urb(dev->interrupt, mem_flags); dev_dbg(&dev->udev->dev, "incremented interrupt URB count to %d\n", dev->interrupt_count); mutex_unlock(&dev->interrupt_mutex); } return ret; } EXPORT_SYMBOL_GPL(usbnet_status_start); /* For resume; submit interrupt URB if previously submitted */ static int __usbnet_status_start_force(struct usbnet *dev, gfp_t mem_flags) { int ret = 0; mutex_lock(&dev->interrupt_mutex); if (dev->interrupt_count) { ret = usb_submit_urb(dev->interrupt, mem_flags); dev_dbg(&dev->udev->dev, "submitted interrupt URB for resume\n"); } mutex_unlock(&dev->interrupt_mutex); return ret; } /* Kill the interrupt URB if all submitters want it killed */ void usbnet_status_stop(struct usbnet *dev) { if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); WARN_ON(dev->interrupt_count == 0); if (dev->interrupt_count && --dev->interrupt_count == 0) usb_kill_urb(dev->interrupt); dev_dbg(&dev->udev->dev, "decremented interrupt URB count to %d\n", dev->interrupt_count); mutex_unlock(&dev->interrupt_mutex); } } EXPORT_SYMBOL_GPL(usbnet_status_stop); /* For suspend; always kill interrupt URB */ static void __usbnet_status_stop_force(struct usbnet *dev) { if (dev->interrupt) { mutex_lock(&dev->interrupt_mutex); usb_kill_urb(dev->interrupt); dev_dbg(&dev->udev->dev, "killed interrupt URB for suspend\n"); mutex_unlock(&dev->interrupt_mutex); } } /* Passes this packet up the stack, updating its accounting. * Some link protocols batch packets, so their rx_fixup paths * can return clones as well as just modify the original skb. */ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; int status; if (test_bit(EVENT_RX_PAUSED, &dev->flags)) { skb_queue_tail(&dev->rxq_pause, skb); return; } /* only update if unset to allow minidriver rx_fixup override */ if (skb->protocol == 0) skb->protocol = eth_type_trans (skb, dev->net); flags = u64_stats_update_begin_irqsave(&stats64->syncp); u64_stats_inc(&stats64->rx_packets); u64_stats_add(&stats64->rx_bytes, skb->len); u64_stats_update_end_irqrestore(&stats64->syncp, flags); netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n", skb->len + sizeof (struct ethhdr), skb->protocol); memset (skb->cb, 0, sizeof (struct skb_data)); if (skb_defer_rx_timestamp(skb)) return; status = netif_rx (skb); if (status != NET_RX_SUCCESS) netif_dbg(dev, rx_err, dev->net, "netif_rx status %d\n", status); } EXPORT_SYMBOL_GPL(usbnet_skb_return); /* must be called if hard_mtu or rx_urb_size changed */ void usbnet_update_max_qlen(struct usbnet *dev) { enum usb_device_speed speed = dev->udev->speed; if (!dev->rx_urb_size || !dev->hard_mtu) goto insanity; switch (speed) { case USB_SPEED_HIGH: dev->rx_qlen = MAX_QUEUE_MEMORY / dev->rx_urb_size; dev->tx_qlen = MAX_QUEUE_MEMORY / dev->hard_mtu; break; case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: /* * Not take default 5ms qlen for super speed HC to * save memory, and iperf tests show 2.5ms qlen can * work well */ dev->rx_qlen = 5 * MAX_QUEUE_MEMORY / dev->rx_urb_size; dev->tx_qlen = 5 * MAX_QUEUE_MEMORY / dev->hard_mtu; break; default: insanity: dev->rx_qlen = dev->tx_qlen = 4; } } EXPORT_SYMBOL_GPL(usbnet_update_max_qlen); /*------------------------------------------------------------------------- * * Network Device Driver (peer link to "Host Device", from USB host) * *-------------------------------------------------------------------------*/ int usbnet_change_mtu (struct net_device *net, int new_mtu) { struct usbnet *dev = netdev_priv(net); int ll_mtu = new_mtu + net->hard_header_len; int old_hard_mtu = dev->hard_mtu; int old_rx_urb_size = dev->rx_urb_size; // no second zero-length packet read wanted after mtu-sized packets if ((ll_mtu % dev->maxpacket) == 0) return -EDOM; WRITE_ONCE(net->mtu, new_mtu); dev->hard_mtu = net->mtu + net->hard_header_len; if (dev->rx_urb_size == old_hard_mtu) { dev->rx_urb_size = dev->hard_mtu; if (dev->rx_urb_size > old_rx_urb_size) { usbnet_pause_rx(dev); usbnet_unlink_rx_urbs(dev); usbnet_resume_rx(dev); } } /* max qlen depend on hard_mtu and rx_urb_size */ usbnet_update_max_qlen(dev); return 0; } EXPORT_SYMBOL_GPL(usbnet_change_mtu); /* The caller must hold list->lock */ static void __usbnet_queue_skb(struct sk_buff_head *list, struct sk_buff *newsk, enum skb_state state) { struct skb_data *entry = (struct skb_data *) newsk->cb; __skb_queue_tail(list, newsk); entry->state = state; } /*-------------------------------------------------------------------------*/ /* some LK 2.4 HCDs oopsed if we freed or resubmitted urbs from * completion callbacks. 2.5 should have fixed those bugs... */ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, struct sk_buff_head *list, enum skb_state state) { unsigned long flags; enum skb_state old_state; struct skb_data *entry = (struct skb_data *) skb->cb; spin_lock_irqsave(&list->lock, flags); old_state = entry->state; entry->state = state; __skb_unlink(skb, list); /* defer_bh() is never called with list == &dev->done. * spin_lock_nested() tells lockdep that it is OK to take * dev->done.lock here with list->lock held. */ spin_lock_nested(&dev->done.lock, SINGLE_DEPTH_NESTING); __skb_queue_tail(&dev->done, skb); if (dev->done.qlen == 1) tasklet_schedule(&dev->bh); spin_unlock(&dev->done.lock); spin_unlock_irqrestore(&list->lock, flags); return old_state; } /* some work can't be done in tasklets, so we use keventd * * NOTE: annoying asymmetry: if it's active, schedule_work() fails, * but tasklet_schedule() doesn't. hope the failure is rare. */ void usbnet_defer_kevent (struct usbnet *dev, int work) { set_bit (work, &dev->flags); if (!usbnet_going_away(dev)) { if (!schedule_work(&dev->kevent)) netdev_dbg(dev->net, "kevent %s may have been dropped\n", usbnet_event_names[work]); else netdev_dbg(dev->net, "kevent %s scheduled\n", usbnet_event_names[work]); } } EXPORT_SYMBOL_GPL(usbnet_defer_kevent); /*-------------------------------------------------------------------------*/ static void rx_complete (struct urb *urb); static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags) { struct sk_buff *skb; struct skb_data *entry; int retval = 0; unsigned long lockflags; size_t size = dev->rx_urb_size; /* prevent rx skb allocation when error ratio is high */ if (test_bit(EVENT_RX_KILL, &dev->flags)) { usb_free_urb(urb); return -ENOLINK; } if (test_bit(EVENT_NO_IP_ALIGN, &dev->flags)) skb = __netdev_alloc_skb(dev->net, size, flags); else skb = __netdev_alloc_skb_ip_align(dev->net, size, flags); if (!skb) { netif_dbg(dev, rx_err, dev->net, "no rx skb\n"); usbnet_defer_kevent (dev, EVENT_RX_MEMORY); usb_free_urb (urb); return -ENOMEM; } entry = (struct skb_data *) skb->cb; entry->urb = urb; entry->dev = dev; entry->length = 0; usb_fill_bulk_urb (urb, dev->udev, dev->in, skb->data, size, rx_complete, skb); spin_lock_irqsave (&dev->rxq.lock, lockflags); if (netif_running (dev->net) && netif_device_present (dev->net) && test_bit(EVENT_DEV_OPEN, &dev->flags) && !test_bit (EVENT_RX_HALT, &dev->flags) && !test_bit (EVENT_DEV_ASLEEP, &dev->flags)) { switch (retval = usb_submit_urb (urb, GFP_ATOMIC)) { case -EPIPE: usbnet_defer_kevent (dev, EVENT_RX_HALT); break; case -ENOMEM: usbnet_defer_kevent (dev, EVENT_RX_MEMORY); break; case -ENODEV: netif_dbg(dev, ifdown, dev->net, "device gone\n"); netif_device_detach (dev->net); break; case -EHOSTUNREACH: retval = -ENOLINK; break; default: netif_dbg(dev, rx_err, dev->net, "rx submit, %d\n", retval); tasklet_schedule (&dev->bh); break; case 0: if (!usbnet_going_away(dev)) __usbnet_queue_skb(&dev->rxq, skb, rx_start); } } else { netif_dbg(dev, ifdown, dev->net, "rx: stopped\n"); retval = -ENOLINK; } spin_unlock_irqrestore (&dev->rxq.lock, lockflags); if (retval) { dev_kfree_skb_any (skb); usb_free_urb (urb); } return retval; } /*-------------------------------------------------------------------------*/ static inline int rx_process(struct usbnet *dev, struct sk_buff *skb) { if (dev->driver_info->rx_fixup && !dev->driver_info->rx_fixup (dev, skb)) { /* With RX_ASSEMBLE, rx_fixup() must update counters */ if (!(dev->driver_info->flags & FLAG_RX_ASSEMBLE)) dev->net->stats.rx_errors++; return -EPROTO; } // else network stack removes extra byte if we forced a short packet /* all data was already cloned from skb inside the driver */ if (dev->driver_info->flags & FLAG_MULTI_PACKET) return -EALREADY; if (skb->len < ETH_HLEN) { dev->net->stats.rx_errors++; dev->net->stats.rx_length_errors++; netif_dbg(dev, rx_err, dev->net, "rx length %d\n", skb->len); return -EPROTO; } usbnet_skb_return(dev, skb); return 0; } /*-------------------------------------------------------------------------*/ static void rx_complete (struct urb *urb) { struct sk_buff *skb = (struct sk_buff *) urb->context; struct skb_data *entry = (struct skb_data *) skb->cb; struct usbnet *dev = entry->dev; int urb_status = urb->status; enum skb_state state; skb_put (skb, urb->actual_length); state = rx_done; entry->urb = NULL; switch (urb_status) { /* success */ case 0: break; /* stalls need manual reset. this is rare ... except that * when going through USB 2.0 TTs, unplug appears this way. * we avoid the highspeed version of the ETIMEDOUT/EILSEQ * storm, recovering as needed. */ case -EPIPE: dev->net->stats.rx_errors++; usbnet_defer_kevent (dev, EVENT_RX_HALT); fallthrough; /* software-driven interface shutdown */ case -ECONNRESET: /* async unlink */ case -ESHUTDOWN: /* hardware gone */ netif_dbg(dev, ifdown, dev->net, "rx shutdown, code %d\n", urb_status); goto block; /* we get controller i/o faults during hub_wq disconnect() delays. * throttle down resubmits, to avoid log floods; just temporarily, * so we still recover when the fault isn't a hub_wq delay. */ case -EPROTO: case -ETIME: case -EILSEQ: dev->net->stats.rx_errors++; if (!timer_pending (&dev->delay)) { mod_timer (&dev->delay, jiffies + THROTTLE_JIFFIES); netif_dbg(dev, link, dev->net, "rx throttle %d\n", urb_status); } block: state = rx_cleanup; entry->urb = urb; urb = NULL; break; /* data overrun ... flush fifo? */ case -EOVERFLOW: dev->net->stats.rx_over_errors++; fallthrough; default: state = rx_cleanup; dev->net->stats.rx_errors++; netif_dbg(dev, rx_err, dev->net, "rx status %d\n", urb_status); break; } /* stop rx if packet error rate is high */ if (++dev->pkt_cnt > 30) { dev->pkt_cnt = 0; dev->pkt_err = 0; } else { if (state == rx_cleanup) dev->pkt_err++; if (dev->pkt_err > 20) set_bit(EVENT_RX_KILL, &dev->flags); } state = defer_bh(dev, skb, &dev->rxq, state); if (urb) { if (netif_running (dev->net) && !test_bit (EVENT_RX_HALT, &dev->flags) && state != unlink_start) { rx_submit (dev, urb, GFP_ATOMIC); usb_mark_last_busy(dev->udev); return; } usb_free_urb (urb); } netif_dbg(dev, rx_err, dev->net, "no read resubmitted\n"); } /*-------------------------------------------------------------------------*/ void usbnet_pause_rx(struct usbnet *dev) { set_bit(EVENT_RX_PAUSED, &dev->flags); netif_dbg(dev, rx_status, dev->net, "paused rx queue enabled\n"); } EXPORT_SYMBOL_GPL(usbnet_pause_rx); void usbnet_resume_rx(struct usbnet *dev) { struct sk_buff *skb; int num = 0; clear_bit(EVENT_RX_PAUSED, &dev->flags); while ((skb = skb_dequeue(&dev->rxq_pause)) != NULL) { usbnet_skb_return(dev, skb); num++; } tasklet_schedule(&dev->bh); netif_dbg(dev, rx_status, dev->net, "paused rx queue disabled, %d skbs requeued\n", num); } EXPORT_SYMBOL_GPL(usbnet_resume_rx); void usbnet_purge_paused_rxq(struct usbnet *dev) { skb_queue_purge(&dev->rxq_pause); } EXPORT_SYMBOL_GPL(usbnet_purge_paused_rxq); /*-------------------------------------------------------------------------*/ // unlink pending rx/tx; completion handlers do all other cleanup static int unlink_urbs (struct usbnet *dev, struct sk_buff_head *q) { unsigned long flags; struct sk_buff *skb; int count = 0; spin_lock_irqsave (&q->lock, flags); while (!skb_queue_empty(q)) { struct skb_data *entry; struct urb *urb; int retval; skb_queue_walk(q, skb) { entry = (struct skb_data *) skb->cb; if (entry->state != unlink_start) goto found; } break; found: entry->state = unlink_start; urb = entry->urb; /* * Get reference count of the URB to avoid it to be * freed during usb_unlink_urb, which may trigger * use-after-free problem inside usb_unlink_urb since * usb_unlink_urb is always racing with .complete * handler(include defer_bh). */ usb_get_urb(urb); spin_unlock_irqrestore(&q->lock, flags); // during some PM-driven resume scenarios, // these (async) unlinks complete immediately retval = usb_unlink_urb (urb); if (retval != -EINPROGRESS && retval != 0) netdev_dbg(dev->net, "unlink urb err, %d\n", retval); else count++; usb_put_urb(urb); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore (&q->lock, flags); return count; } // Flush all pending rx urbs // minidrivers may need to do this when the MTU changes void usbnet_unlink_rx_urbs(struct usbnet *dev) { if (netif_running(dev->net)) { (void) unlink_urbs (dev, &dev->rxq); tasklet_schedule(&dev->bh); } } EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); /*-------------------------------------------------------------------------*/ static void wait_skb_queue_empty(struct sk_buff_head *q) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); while (!skb_queue_empty(q)) { spin_unlock_irqrestore(&q->lock, flags); schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); set_current_state(TASK_UNINTERRUPTIBLE); spin_lock_irqsave(&q->lock, flags); } spin_unlock_irqrestore(&q->lock, flags); } // precondition: never called in_interrupt static void usbnet_terminate_urbs(struct usbnet *dev) { DECLARE_WAITQUEUE(wait, current); int temp; /* ensure there are no more active urbs */ add_wait_queue(&dev->wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); temp = unlink_urbs(dev, &dev->txq) + unlink_urbs(dev, &dev->rxq); /* maybe wait for deletions to finish. */ wait_skb_queue_empty(&dev->rxq); wait_skb_queue_empty(&dev->txq); wait_skb_queue_empty(&dev->done); netif_dbg(dev, ifdown, dev->net, "waited for %d urb completions\n", temp); set_current_state(TASK_RUNNING); remove_wait_queue(&dev->wait, &wait); } int usbnet_stop (struct net_device *net) { struct usbnet *dev = netdev_priv(net); const struct driver_info *info = dev->driver_info; int retval, pm, mpn; clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); netif_info(dev, ifdown, dev->net, "stop stats: rx/tx %lu/%lu, errs %lu/%lu\n", net->stats.rx_packets, net->stats.tx_packets, net->stats.rx_errors, net->stats.tx_errors); /* to not race resume */ pm = usb_autopm_get_interface(dev->intf); /* allow minidriver to stop correctly (wireless devices to turn off * radio etc) */ if (info->stop) { retval = info->stop(dev); if (retval < 0) netif_info(dev, ifdown, dev->net, "stop fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); } if (!(info->flags & FLAG_AVOID_UNLINK_URBS)) usbnet_terminate_urbs(dev); usbnet_status_stop(dev); usbnet_purge_paused_rxq(dev); mpn = !test_and_clear_bit(EVENT_NO_RUNTIME_PM, &dev->flags); /* deferred work (timer, softirq, task) must also stop */ dev->flags = 0; del_timer_sync(&dev->delay); tasklet_kill(&dev->bh); cancel_work_sync(&dev->kevent); /* We have cyclic dependencies. Those calls are needed * to break a cycle. We cannot fall into the gaps because * we have a flag */ tasklet_kill(&dev->bh); del_timer_sync(&dev->delay); cancel_work_sync(&dev->kevent); if (!pm) usb_autopm_put_interface(dev->intf); if (info->manage_power && mpn) info->manage_power(dev, 0); else usb_autopm_put_interface(dev->intf); return 0; } EXPORT_SYMBOL_GPL(usbnet_stop); /*-------------------------------------------------------------------------*/ // posts reads, and enables write queuing // precondition: never called in_interrupt int usbnet_open (struct net_device *net) { struct usbnet *dev = netdev_priv(net); int retval; const struct driver_info *info = dev->driver_info; if ((retval = usb_autopm_get_interface(dev->intf)) < 0) { netif_info(dev, ifup, dev->net, "resumption fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); goto done_nopm; } // put into "known safe" state if (info->reset && (retval = info->reset (dev)) < 0) { netif_info(dev, ifup, dev->net, "open reset fail (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); goto done; } /* hard_mtu or rx_urb_size may change in reset() */ usbnet_update_max_qlen(dev); // insist peer be connected if (info->check_connect && (retval = info->check_connect (dev)) < 0) { netif_err(dev, ifup, dev->net, "can't open; %d\n", retval); goto done; } /* start any status interrupt transfer */ if (dev->interrupt) { retval = usbnet_status_start(dev, GFP_KERNEL); if (retval < 0) { netif_err(dev, ifup, dev->net, "intr submit %d\n", retval); goto done; } } set_bit(EVENT_DEV_OPEN, &dev->flags); netif_start_queue (net); netif_info(dev, ifup, dev->net, "open: enable queueing (rx %d, tx %d) mtu %d %s framing\n", (int)RX_QLEN(dev), (int)TX_QLEN(dev), dev->net->mtu, (dev->driver_info->flags & FLAG_FRAMING_NC) ? "NetChip" : (dev->driver_info->flags & FLAG_FRAMING_GL) ? "GeneSys" : (dev->driver_info->flags & FLAG_FRAMING_Z) ? "Zaurus" : (dev->driver_info->flags & FLAG_FRAMING_RN) ? "RNDIS" : (dev->driver_info->flags & FLAG_FRAMING_AX) ? "ASIX" : "simple"); /* reset rx error state */ dev->pkt_cnt = 0; dev->pkt_err = 0; clear_bit(EVENT_RX_KILL, &dev->flags); // delay posting reads until we're fully open tasklet_schedule (&dev->bh); if (info->manage_power) { retval = info->manage_power(dev, 1); if (retval < 0) { retval = 0; set_bit(EVENT_NO_RUNTIME_PM, &dev->flags); } else { usb_autopm_put_interface(dev->intf); } } return retval; done: usb_autopm_put_interface(dev->intf); done_nopm: return retval; } EXPORT_SYMBOL_GPL(usbnet_open); /*-------------------------------------------------------------------------*/ /* ethtool methods; minidrivers may need to add some more, but * they'll probably want to use this base set. */ /* These methods are written on the assumption that the device * uses MII */ int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); if (!dev->mii.mdio_read) return -EOPNOTSUPP; mii_ethtool_get_link_ksettings(&dev->mii, cmd); return 0; } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii); int usbnet_get_link_ksettings_internal(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); /* the assumption that speed is equal on tx and rx * is deeply engrained into the networking layer. * For wireless stuff it is not true. * We assume that rx_speed matters more. */ if (dev->rx_speed != SPEED_UNSET) cmd->base.speed = dev->rx_speed / 1000000; else if (dev->tx_speed != SPEED_UNSET) cmd->base.speed = dev->tx_speed / 1000000; else cmd->base.speed = SPEED_UNKNOWN; return 0; } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_internal); int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); int retval; if (!dev->mii.mdio_write) return -EOPNOTSUPP; retval = mii_ethtool_set_link_ksettings(&dev->mii, cmd); /* link speed/duplex might have changed */ if (dev->driver_info->link_reset) dev->driver_info->link_reset(dev); /* hard_mtu or rx_urb_size may change in link_reset() */ usbnet_update_max_qlen(dev); return retval; } EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings_mii); u32 usbnet_get_link (struct net_device *net) { struct usbnet *dev = netdev_priv(net); /* If a check_connect is defined, return its result */ if (dev->driver_info->check_connect) return dev->driver_info->check_connect (dev) == 0; /* if the device has mii operations, use those */ if (dev->mii.mdio_read) return mii_link_ok(&dev->mii); /* Otherwise, dtrt for drivers calling netif_carrier_{on,off} */ return ethtool_op_get_link(net); } EXPORT_SYMBOL_GPL(usbnet_get_link); int usbnet_nway_reset(struct net_device *net) { struct usbnet *dev = netdev_priv(net); if (!dev->mii.mdio_write) return -EOPNOTSUPP; return mii_nway_restart(&dev->mii); } EXPORT_SYMBOL_GPL(usbnet_nway_reset); void usbnet_get_drvinfo (struct net_device *net, struct ethtool_drvinfo *info) { struct usbnet *dev = netdev_priv(net); strscpy(info->driver, dev->driver_name, sizeof(info->driver)); strscpy(info->fw_version, dev->driver_info->description, sizeof(info->fw_version)); usb_make_path (dev->udev, info->bus_info, sizeof info->bus_info); } EXPORT_SYMBOL_GPL(usbnet_get_drvinfo); u32 usbnet_get_msglevel (struct net_device *net) { struct usbnet *dev = netdev_priv(net); return dev->msg_enable; } EXPORT_SYMBOL_GPL(usbnet_get_msglevel); void usbnet_set_msglevel (struct net_device *net, u32 level) { struct usbnet *dev = netdev_priv(net); dev->msg_enable = level; } EXPORT_SYMBOL_GPL(usbnet_set_msglevel); /* drivers may override default ethtool_ops in their bind() routine */ static const struct ethtool_ops usbnet_ethtool_ops = { .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, .get_link_ksettings = usbnet_get_link_ksettings_mii, .set_link_ksettings = usbnet_set_link_ksettings_mii, }; /*-------------------------------------------------------------------------*/ static void __handle_link_change(struct usbnet *dev) { if (!test_bit(EVENT_DEV_OPEN, &dev->flags)) return; if (!netif_carrier_ok(dev->net)) { /* kill URBs for reading packets to save bus bandwidth */ unlink_urbs(dev, &dev->rxq); /* * tx_timeout will unlink URBs for sending packets and * tx queue is stopped by netcore after link becomes off */ } else { /* submitting URBs for reading packets */ tasklet_schedule(&dev->bh); } /* hard_mtu or rx_urb_size may change during link change */ usbnet_update_max_qlen(dev); clear_bit(EVENT_LINK_CHANGE, &dev->flags); } void usbnet_set_rx_mode(struct net_device *net) { struct usbnet *dev = netdev_priv(net); usbnet_defer_kevent(dev, EVENT_SET_RX_MODE); } EXPORT_SYMBOL_GPL(usbnet_set_rx_mode); static void __handle_set_rx_mode(struct usbnet *dev) { if (dev->driver_info->set_rx_mode) (dev->driver_info->set_rx_mode)(dev); clear_bit(EVENT_SET_RX_MODE, &dev->flags); } /* work that cannot be done in interrupt context uses keventd. * * NOTE: with 2.5 we could do more of this using completion callbacks, * especially now that control transfers can be queued. */ static void usbnet_deferred_kevent (struct work_struct *work) { struct usbnet *dev = container_of(work, struct usbnet, kevent); int status; /* usb_clear_halt() needs a thread context */ if (test_bit (EVENT_TX_HALT, &dev->flags)) { unlink_urbs (dev, &dev->txq); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto fail_pipe; status = usb_clear_halt (dev->udev, dev->out); usb_autopm_put_interface(dev->intf); if (status < 0 && status != -EPIPE && status != -ESHUTDOWN) { if (netif_msg_tx_err (dev)) fail_pipe: netdev_err(dev->net, "can't clear tx halt, status %d\n", status); } else { clear_bit (EVENT_TX_HALT, &dev->flags); if (status != -ESHUTDOWN) netif_wake_queue (dev->net); } } if (test_bit (EVENT_RX_HALT, &dev->flags)) { unlink_urbs (dev, &dev->rxq); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto fail_halt; status = usb_clear_halt (dev->udev, dev->in); usb_autopm_put_interface(dev->intf); if (status < 0 && status != -EPIPE && status != -ESHUTDOWN) { if (netif_msg_rx_err (dev)) fail_halt: netdev_err(dev->net, "can't clear rx halt, status %d\n", status); } else { clear_bit (EVENT_RX_HALT, &dev->flags); if (!usbnet_going_away(dev)) tasklet_schedule(&dev->bh); } } /* tasklet could resubmit itself forever if memory is tight */ if (test_bit (EVENT_RX_MEMORY, &dev->flags)) { struct urb *urb = NULL; int resched = 1; if (netif_running (dev->net)) urb = usb_alloc_urb (0, GFP_KERNEL); else clear_bit (EVENT_RX_MEMORY, &dev->flags); if (urb != NULL) { clear_bit (EVENT_RX_MEMORY, &dev->flags); status = usb_autopm_get_interface(dev->intf); if (status < 0) { usb_free_urb(urb); goto fail_lowmem; } if (rx_submit (dev, urb, GFP_KERNEL) == -ENOLINK) resched = 0; usb_autopm_put_interface(dev->intf); fail_lowmem: if (resched) if (!usbnet_going_away(dev)) tasklet_schedule(&dev->bh); } } if (test_bit (EVENT_LINK_RESET, &dev->flags)) { const struct driver_info *info = dev->driver_info; int retval = 0; clear_bit (EVENT_LINK_RESET, &dev->flags); status = usb_autopm_get_interface(dev->intf); if (status < 0) goto skip_reset; if(info->link_reset && (retval = info->link_reset(dev)) < 0) { usb_autopm_put_interface(dev->intf); skip_reset: netdev_info(dev->net, "link reset failed (%d) usbnet usb-%s-%s, %s\n", retval, dev->udev->bus->bus_name, dev->udev->devpath, info->description); } else { usb_autopm_put_interface(dev->intf); } /* handle link change from link resetting */ __handle_link_change(dev); } if (test_bit (EVENT_LINK_CHANGE, &dev->flags)) __handle_link_change(dev); if (test_bit (EVENT_SET_RX_MODE, &dev->flags)) __handle_set_rx_mode(dev); if (dev->flags) netdev_dbg(dev->net, "kevent done, flags = 0x%lx\n", dev->flags); } /*-------------------------------------------------------------------------*/ static void tx_complete (struct urb *urb) { struct sk_buff *skb = (struct sk_buff *) urb->context; struct skb_data *entry = (struct skb_data *) skb->cb; struct usbnet *dev = entry->dev; if (urb->status == 0) { struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->net->tstats); unsigned long flags; flags = u64_stats_update_begin_irqsave(&stats64->syncp); u64_stats_add(&stats64->tx_packets, entry->packets); u64_stats_add(&stats64->tx_bytes, entry->length); u64_stats_update_end_irqrestore(&stats64->syncp, flags); } else { dev->net->stats.tx_errors++; switch (urb->status) { case -EPIPE: usbnet_defer_kevent (dev, EVENT_TX_HALT); break; /* software-driven interface shutdown */ case -ECONNRESET: // async unlink case -ESHUTDOWN: // hardware gone break; /* like rx, tx gets controller i/o faults during hub_wq * delays and so it uses the same throttling mechanism. */ case -EPROTO: case -ETIME: case -EILSEQ: usb_mark_last_busy(dev->udev); if (!timer_pending (&dev->delay)) { mod_timer (&dev->delay, jiffies + THROTTLE_JIFFIES); netif_dbg(dev, link, dev->net, "tx throttle %d\n", urb->status); } netif_stop_queue (dev->net); break; default: netif_dbg(dev, tx_err, dev->net, "tx err %d\n", entry->urb->status); break; } } usb_autopm_put_interface_async(dev->intf); (void) defer_bh(dev, skb, &dev->txq, tx_done); } /*-------------------------------------------------------------------------*/ void usbnet_tx_timeout (struct net_device *net, unsigned int txqueue) { struct usbnet *dev = netdev_priv(net); unlink_urbs (dev, &dev->txq); tasklet_schedule (&dev->bh); /* this needs to be handled individually because the generic layer * doesn't know what is sufficient and could not restore private * information if a remedy of an unconditional reset were used. */ if (dev->driver_info->recover) (dev->driver_info->recover)(dev); } EXPORT_SYMBOL_GPL(usbnet_tx_timeout); /*-------------------------------------------------------------------------*/ static int build_dma_sg(const struct sk_buff *skb, struct urb *urb) { unsigned num_sgs, total_len = 0; int i, s = 0; num_sgs = skb_shinfo(skb)->nr_frags + 1; if (num_sgs == 1) return 0; /* reserve one for zero packet */ urb->sg = kmalloc_array(num_sgs + 1, sizeof(struct scatterlist), GFP_ATOMIC); if (!urb->sg) return -ENOMEM; urb->num_sgs = num_sgs; sg_init_table(urb->sg, urb->num_sgs + 1); sg_set_buf(&urb->sg[s++], skb->data, skb_headlen(skb)); total_len += skb_headlen(skb); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *f = &skb_shinfo(skb)->frags[i]; total_len += skb_frag_size(f); sg_set_page(&urb->sg[i + s], skb_frag_page(f), skb_frag_size(f), skb_frag_off(f)); } urb->transfer_buffer_length = total_len; return 1; } netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, struct net_device *net) { struct usbnet *dev = netdev_priv(net); unsigned int length; struct urb *urb = NULL; struct skb_data *entry; const struct driver_info *info = dev->driver_info; unsigned long flags; int retval; if (skb) skb_tx_timestamp(skb); // some devices want funky USB-level framing, for // win32 driver (usually) and/or hardware quirks if (info->tx_fixup) { skb = info->tx_fixup (dev, skb, GFP_ATOMIC); if (!skb) { /* packet collected; minidriver waiting for more */ if (info->flags & FLAG_MULTI_PACKET) goto not_drop; netif_dbg(dev, tx_err, dev->net, "can't tx_fixup skb\n"); goto drop; } } if (!(urb = usb_alloc_urb (0, GFP_ATOMIC))) { netif_dbg(dev, tx_err, dev->net, "no urb\n"); goto drop; } entry = (struct skb_data *) skb->cb; entry->urb = urb; entry->dev = dev; usb_fill_bulk_urb (urb, dev->udev, dev->out, skb->data, skb->len, tx_complete, skb); if (dev->can_dma_sg) { if (build_dma_sg(skb, urb) < 0) goto drop; } length = urb->transfer_buffer_length; /* don't assume the hardware handles USB_ZERO_PACKET * NOTE: strictly conforming cdc-ether devices should expect * the ZLP here, but ignore the one-byte packet. * NOTE2: CDC NCM specification is different from CDC ECM when * handling ZLP/short packets, so cdc_ncm driver will make short * packet itself if needed. */ if (length % dev->maxpacket == 0) { if (!(info->flags & FLAG_SEND_ZLP)) { if (!(info->flags & FLAG_MULTI_PACKET)) { length++; if (skb_tailroom(skb) && !urb->num_sgs) { skb->data[skb->len] = 0; __skb_put(skb, 1); } else if (urb->num_sgs) sg_set_buf(&urb->sg[urb->num_sgs++], dev->padding_pkt, 1); } } else urb->transfer_flags |= URB_ZERO_PACKET; } urb->transfer_buffer_length = length; if (info->flags & FLAG_MULTI_PACKET) { /* Driver has set number of packets and a length delta. * Calculate the complete length and ensure that it's * positive. */ entry->length += length; if (WARN_ON_ONCE(entry->length <= 0)) entry->length = length; } else { usbnet_set_skb_tx_stats(skb, 1, length); } spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); if (retval < 0) { spin_unlock_irqrestore(&dev->txq.lock, flags); goto drop; } if (netif_queue_stopped(net)) { usb_autopm_put_interface_async(dev->intf); spin_unlock_irqrestore(&dev->txq.lock, flags); goto drop; } #ifdef CONFIG_PM /* if this triggers the device is still a sleep */ if (test_bit(EVENT_DEV_ASLEEP, &dev->flags)) { /* transmission will be done in resume */ usb_anchor_urb(urb, &dev->deferred); /* no use to process more packets */ netif_stop_queue(net); usb_put_urb(urb); spin_unlock_irqrestore(&dev->txq.lock, flags); netdev_dbg(dev->net, "Delaying transmission for resumption\n"); goto deferred; } #endif switch ((retval = usb_submit_urb (urb, GFP_ATOMIC))) { case -EPIPE: netif_stop_queue (net); usbnet_defer_kevent (dev, EVENT_TX_HALT); usb_autopm_put_interface_async(dev->intf); break; default: usb_autopm_put_interface_async(dev->intf); netif_dbg(dev, tx_err, dev->net, "tx: submit urb err %d\n", retval); break; case 0: netif_trans_update(net); __usbnet_queue_skb(&dev->txq, skb, tx_start); if (dev->txq.qlen >= TX_QLEN (dev)) netif_stop_queue (net); } spin_unlock_irqrestore (&dev->txq.lock, flags); if (retval) { netif_dbg(dev, tx_err, dev->net, "drop, code %d\n", retval); drop: dev->net->stats.tx_dropped++; not_drop: if (skb) dev_kfree_skb_any (skb); if (urb) { kfree(urb->sg); usb_free_urb(urb); } } else netif_dbg(dev, tx_queued, dev->net, "> tx, len %u, type 0x%x\n", length, skb->protocol); #ifdef CONFIG_PM deferred: #endif return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(usbnet_start_xmit); static int rx_alloc_submit(struct usbnet *dev, gfp_t flags) { struct urb *urb; int i; int ret = 0; /* don't refill the queue all at once */ for (i = 0; i < 10 && dev->rxq.qlen < RX_QLEN(dev); i++) { urb = usb_alloc_urb(0, flags); if (urb != NULL) { ret = rx_submit(dev, urb, flags); if (ret) goto err; } else { ret = -ENOMEM; goto err; } } err: return ret; } static inline void usb_free_skb(struct sk_buff *skb) { struct skb_data *entry = (struct skb_data *)skb->cb; usb_free_urb(entry->urb); dev_kfree_skb(skb); } /*-------------------------------------------------------------------------*/ // tasklet (work deferred from completions, in_irq) or timer static void usbnet_bh (struct timer_list *t) { struct usbnet *dev = from_timer(dev, t, delay); struct sk_buff *skb; struct skb_data *entry; while ((skb = skb_dequeue (&dev->done))) { entry = (struct skb_data *) skb->cb; switch (entry->state) { case rx_done: if (rx_process(dev, skb)) usb_free_skb(skb); continue; case tx_done: kfree(entry->urb->sg); fallthrough; case rx_cleanup: usb_free_skb(skb); continue; default: netdev_dbg(dev->net, "bogus skb state %d\n", entry->state); } } /* restart RX again after disabling due to high error rate */ clear_bit(EVENT_RX_KILL, &dev->flags); /* waiting for all pending urbs to complete? * only then can we forgo submitting anew */ if (waitqueue_active(&dev->wait)) { if (dev->txq.qlen + dev->rxq.qlen + dev->done.qlen == 0) wake_up_all(&dev->wait); // or are we maybe short a few urbs? } else if (netif_running (dev->net) && netif_device_present (dev->net) && netif_carrier_ok(dev->net) && !usbnet_going_away(dev) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_PAUSED, &dev->flags) && !test_bit(EVENT_RX_HALT, &dev->flags)) { int temp = dev->rxq.qlen; if (temp < RX_QLEN(dev)) { if (rx_alloc_submit(dev, GFP_ATOMIC) == -ENOLINK) return; if (temp != dev->rxq.qlen) netif_dbg(dev, link, dev->net, "rxqlen %d --> %d\n", temp, dev->rxq.qlen); if (dev->rxq.qlen < RX_QLEN(dev)) tasklet_schedule (&dev->bh); } if (dev->txq.qlen < TX_QLEN (dev)) netif_wake_queue (dev->net); } } static void usbnet_bh_tasklet(struct tasklet_struct *t) { struct usbnet *dev = from_tasklet(dev, t, bh); usbnet_bh(&dev->delay); } /*------------------------------------------------------------------------- * * USB Device Driver support * *-------------------------------------------------------------------------*/ // precondition: never called in_interrupt void usbnet_disconnect (struct usb_interface *intf) { struct usbnet *dev; struct usb_device *xdev; struct net_device *net; struct urb *urb; dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (!dev) return; usbnet_mark_going_away(dev); xdev = interface_to_usbdev (intf); netif_info(dev, probe, dev->net, "unregister '%s' usb-%s-%s, %s\n", intf->dev.driver->name, xdev->bus->bus_name, xdev->devpath, dev->driver_info->description); net = dev->net; unregister_netdev (net); while ((urb = usb_get_from_anchor(&dev->deferred))) { dev_kfree_skb(urb->context); kfree(urb->sg); usb_free_urb(urb); } if (dev->driver_info->unbind) dev->driver_info->unbind(dev, intf); usb_kill_urb(dev->interrupt); usb_free_urb(dev->interrupt); kfree(dev->padding_pkt); free_netdev(net); } EXPORT_SYMBOL_GPL(usbnet_disconnect); static const struct net_device_ops usbnet_netdev_ops = { .ndo_open = usbnet_open, .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; /*-------------------------------------------------------------------------*/ // precondition: never called in_interrupt static const struct device_type wlan_type = { .name = "wlan", }; static const struct device_type wwan_type = { .name = "wwan", }; int usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) { struct usbnet *dev; struct net_device *net; struct usb_host_interface *interface; const struct driver_info *info; struct usb_device *xdev; int status; const char *name; struct usb_driver *driver = to_usb_driver(udev->dev.driver); /* usbnet already took usb runtime pm, so have to enable the feature * for usb interface, otherwise usb_autopm_get_interface may return * failure if RUNTIME_PM is enabled. */ if (!driver->supports_autosuspend) { driver->supports_autosuspend = 1; pm_runtime_enable(&udev->dev); } name = udev->dev.driver->name; info = (const struct driver_info *) prod->driver_info; if (!info) { dev_dbg (&udev->dev, "blacklisted by %s\n", name); return -ENODEV; } xdev = interface_to_usbdev (udev); interface = udev->cur_altsetting; status = -ENOMEM; // set up our own records net = alloc_etherdev(sizeof(*dev)); if (!net) goto out; /* netdev_printk() needs this so do it as early as possible */ SET_NETDEV_DEV(net, &udev->dev); dev = netdev_priv(net); dev->udev = xdev; dev->intf = udev; dev->driver_info = info; dev->driver_name = name; dev->rx_speed = SPEED_UNSET; dev->tx_speed = SPEED_UNSET; dev->msg_enable = netif_msg_init (msg_level, NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK); init_waitqueue_head(&dev->wait); skb_queue_head_init (&dev->rxq); skb_queue_head_init (&dev->txq); skb_queue_head_init (&dev->done); skb_queue_head_init(&dev->rxq_pause); tasklet_setup(&dev->bh, usbnet_bh_tasklet); INIT_WORK (&dev->kevent, usbnet_deferred_kevent); init_usb_anchor(&dev->deferred); timer_setup(&dev->delay, usbnet_bh, 0); mutex_init (&dev->phy_mutex); mutex_init(&dev->interrupt_mutex); dev->interrupt_count = 0; dev->net = net; strscpy(net->name, "usb%d", sizeof(net->name)); /* rx and tx sides can use different message sizes; * bind() should set rx_urb_size in that case. */ dev->hard_mtu = net->mtu + net->hard_header_len; net->min_mtu = 0; net->max_mtu = ETH_MAX_MTU; net->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; net->netdev_ops = &usbnet_netdev_ops; net->watchdog_timeo = TX_TIMEOUT_JIFFIES; net->ethtool_ops = &usbnet_ethtool_ops; net->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; // allow device-specific bind/init procedures // NOTE net->name still not usable ... if (info->bind) { status = info->bind (dev, udev); if (status < 0) goto out1; // heuristic: "usb%d" for links we know are two-host, // else "eth%d" when there's reasonable doubt. userspace // can rename the link if it knows better. if ((dev->driver_info->flags & FLAG_ETHER) != 0 && ((dev->driver_info->flags & FLAG_POINTTOPOINT) == 0 || /* somebody touched it*/ !is_zero_ether_addr(net->dev_addr))) strscpy(net->name, "eth%d", sizeof(net->name)); /* WLAN devices should always be named "wlan%d" */ if ((dev->driver_info->flags & FLAG_WLAN) != 0) strscpy(net->name, "wlan%d", sizeof(net->name)); /* WWAN devices should always be named "wwan%d" */ if ((dev->driver_info->flags & FLAG_WWAN) != 0) strscpy(net->name, "wwan%d", sizeof(net->name)); /* devices that cannot do ARP */ if ((dev->driver_info->flags & FLAG_NOARP) != 0) net->flags |= IFF_NOARP; /* maybe the remote can't receive an Ethernet MTU */ if (net->mtu > (dev->hard_mtu - net->hard_header_len)) net->mtu = dev->hard_mtu - net->hard_header_len; } else if (!info->in || !info->out) status = usbnet_get_endpoints (dev, udev); else { u8 ep_addrs[3] = { info->in + USB_DIR_IN, info->out + USB_DIR_OUT, 0 }; dev->in = usb_rcvbulkpipe (xdev, info->in); dev->out = usb_sndbulkpipe (xdev, info->out); if (!(info->flags & FLAG_NO_SETINT)) status = usb_set_interface (xdev, interface->desc.bInterfaceNumber, interface->desc.bAlternateSetting); else status = 0; if (status == 0 && !usb_check_bulk_endpoints(udev, ep_addrs)) status = -EINVAL; } if (status >= 0 && dev->status) status = init_status (dev, udev); if (status < 0) goto out3; if (!dev->rx_urb_size) dev->rx_urb_size = dev->hard_mtu; dev->maxpacket = usb_maxpacket(dev->udev, dev->out); if (dev->maxpacket == 0) { /* that is a broken device */ status = -ENODEV; goto out4; } /* this flags the device for user space */ if (!is_valid_ether_addr(net->dev_addr)) eth_hw_addr_random(net); if ((dev->driver_info->flags & FLAG_WLAN) != 0) SET_NETDEV_DEVTYPE(net, &wlan_type); if ((dev->driver_info->flags & FLAG_WWAN) != 0) SET_NETDEV_DEVTYPE(net, &wwan_type); /* initialize max rx_qlen and tx_qlen */ usbnet_update_max_qlen(dev); if (dev->can_dma_sg && !(info->flags & FLAG_SEND_ZLP) && !(info->flags & FLAG_MULTI_PACKET)) { dev->padding_pkt = kzalloc(1, GFP_KERNEL); if (!dev->padding_pkt) { status = -ENOMEM; goto out4; } } status = register_netdev (net); if (status) goto out5; netif_info(dev, probe, dev->net, "register '%s' at usb-%s-%s, %s, %pM\n", udev->dev.driver->name, xdev->bus->bus_name, xdev->devpath, dev->driver_info->description, net->dev_addr); // ok, it's ready to go. usb_set_intfdata (udev, dev); netif_device_attach (net); if (dev->driver_info->flags & FLAG_LINK_INTR) usbnet_link_change(dev, 0, 0); return 0; out5: kfree(dev->padding_pkt); out4: usb_free_urb(dev->interrupt); out3: if (info->unbind) info->unbind (dev, udev); out1: /* subdrivers must undo all they did in bind() if they * fail it, but we may fail later and a deferred kevent * may trigger an error resubmitting itself and, worse, * schedule a timer. So we kill it all just in case. */ usbnet_mark_going_away(dev); cancel_work_sync(&dev->kevent); del_timer_sync(&dev->delay); free_netdev(net); out: return status; } EXPORT_SYMBOL_GPL(usbnet_probe); /*-------------------------------------------------------------------------*/ /* * suspend the whole driver as soon as the first interface is suspended * resume only when the last interface is resumed */ int usbnet_suspend (struct usb_interface *intf, pm_message_t message) { struct usbnet *dev = usb_get_intfdata(intf); if (!dev->suspend_count++) { spin_lock_irq(&dev->txq.lock); /* don't autosuspend while transmitting */ if (dev->txq.qlen && PMSG_IS_AUTO(message)) { dev->suspend_count--; spin_unlock_irq(&dev->txq.lock); return -EBUSY; } else { set_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); } /* * accelerate emptying of the rx and queues, to avoid * having everything error out. */ netif_device_detach (dev->net); usbnet_terminate_urbs(dev); __usbnet_status_stop_force(dev); /* * reattach so runtime management can use and * wake the device */ netif_device_attach (dev->net); } return 0; } EXPORT_SYMBOL_GPL(usbnet_suspend); int usbnet_resume (struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); struct sk_buff *skb; struct urb *res; int retval; if (!--dev->suspend_count) { /* resume interrupt URB if it was previously submitted */ __usbnet_status_start_force(dev, GFP_NOIO); spin_lock_irq(&dev->txq.lock); while ((res = usb_get_from_anchor(&dev->deferred))) { skb = (struct sk_buff *)res->context; retval = usb_submit_urb(res, GFP_ATOMIC); if (retval < 0) { dev_kfree_skb_any(skb); kfree(res->sg); usb_free_urb(res); usb_autopm_put_interface_async(dev->intf); } else { netif_trans_update(dev->net); __skb_queue_tail(&dev->txq, skb); } } smp_mb(); clear_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); if (test_bit(EVENT_DEV_OPEN, &dev->flags)) { /* handle remote wakeup ASAP * we cannot race against stop */ if (netif_device_present(dev->net) && !timer_pending(&dev->delay) && !test_bit(EVENT_RX_HALT, &dev->flags)) rx_alloc_submit(dev, GFP_NOIO); if (!(dev->txq.qlen >= TX_QLEN(dev))) netif_tx_wake_all_queues(dev->net); tasklet_schedule (&dev->bh); } } if (test_and_clear_bit(EVENT_DEVICE_REPORT_IDLE, &dev->flags)) usb_autopm_get_interface_no_resume(intf); return 0; } EXPORT_SYMBOL_GPL(usbnet_resume); /* * Either a subdriver implements manage_power, then it is assumed to always * be ready to be suspended or it reports the readiness to be suspended * explicitly */ void usbnet_device_suggests_idle(struct usbnet *dev) { if (!test_and_set_bit(EVENT_DEVICE_REPORT_IDLE, &dev->flags)) { dev->intf->needs_remote_wakeup = 1; usb_autopm_put_interface_async(dev->intf); } } EXPORT_SYMBOL(usbnet_device_suggests_idle); /* * For devices that can do without special commands */ int usbnet_manage_power(struct usbnet *dev, int on) { dev->intf->needs_remote_wakeup = on; return 0; } EXPORT_SYMBOL(usbnet_manage_power); void usbnet_link_change(struct usbnet *dev, bool link, bool need_reset) { /* update link after link is reseted */ if (link && !need_reset) netif_carrier_on(dev->net); else netif_carrier_off(dev->net); if (need_reset && link) usbnet_defer_kevent(dev, EVENT_LINK_RESET); else usbnet_defer_kevent(dev, EVENT_LINK_CHANGE); } EXPORT_SYMBOL(usbnet_link_change); /*-------------------------------------------------------------------------*/ static int __usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { void *buf = NULL; int err = -ENOMEM; netdev_dbg(dev->net, "usbnet_read_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); if (size) { buf = kmalloc(size, GFP_NOIO); if (!buf) goto out; } err = usb_control_msg(dev->udev, usb_rcvctrlpipe(dev->udev, 0), cmd, reqtype, value, index, buf, size, USB_CTRL_GET_TIMEOUT); if (err > 0 && err <= size) { if (data) memcpy(data, buf, err); else netdev_dbg(dev->net, "Huh? Data requested but thrown away.\n"); } kfree(buf); out: return err; } static int __usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { void *buf = NULL; int err = -ENOMEM; netdev_dbg(dev->net, "usbnet_write_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); if (data) { buf = kmemdup(data, size, GFP_NOIO); if (!buf) goto out; } else { if (size) { WARN_ON_ONCE(1); err = -EINVAL; goto out; } } err = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), cmd, reqtype, value, index, buf, size, USB_CTRL_SET_TIMEOUT); kfree(buf); out: return err; } /* * The function can't be called inside suspend/resume callback, * otherwise deadlock will be caused. */ int usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { int ret; if (usb_autopm_get_interface(dev->intf) < 0) return -ENODEV; ret = __usbnet_read_cmd(dev, cmd, reqtype, value, index, data, size); usb_autopm_put_interface(dev->intf); return ret; } EXPORT_SYMBOL_GPL(usbnet_read_cmd); /* * The function can't be called inside suspend/resume callback, * otherwise deadlock will be caused. */ int usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { int ret; if (usb_autopm_get_interface(dev->intf) < 0) return -ENODEV; ret = __usbnet_write_cmd(dev, cmd, reqtype, value, index, data, size); usb_autopm_put_interface(dev->intf); return ret; } EXPORT_SYMBOL_GPL(usbnet_write_cmd); /* * The function can be called inside suspend/resume callback safely * and should only be called by suspend/resume callback generally. */ int usbnet_read_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, void *data, u16 size) { return __usbnet_read_cmd(dev, cmd, reqtype, value, index, data, size); } EXPORT_SYMBOL_GPL(usbnet_read_cmd_nopm); /* * The function can be called inside suspend/resume callback safely * and should only be called by suspend/resume callback generally. */ int usbnet_write_cmd_nopm(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { return __usbnet_write_cmd(dev, cmd, reqtype, value, index, data, size); } EXPORT_SYMBOL_GPL(usbnet_write_cmd_nopm); static void usbnet_async_cmd_cb(struct urb *urb) { struct usb_ctrlrequest *req = (struct usb_ctrlrequest *)urb->context; int status = urb->status; if (status < 0) dev_dbg(&urb->dev->dev, "%s failed with %d", __func__, status); kfree(req); usb_free_urb(urb); } /* * The caller must make sure that device can't be put into suspend * state until the control URB completes. */ int usbnet_write_cmd_async(struct usbnet *dev, u8 cmd, u8 reqtype, u16 value, u16 index, const void *data, u16 size) { struct usb_ctrlrequest *req; struct urb *urb; int err = -ENOMEM; void *buf = NULL; netdev_dbg(dev->net, "usbnet_write_cmd cmd=0x%02x reqtype=%02x" " value=0x%04x index=0x%04x size=%d\n", cmd, reqtype, value, index, size); urb = usb_alloc_urb(0, GFP_ATOMIC); if (!urb) goto fail; if (data) { buf = kmemdup(data, size, GFP_ATOMIC); if (!buf) { netdev_err(dev->net, "Error allocating buffer" " in %s!\n", __func__); goto fail_free_urb; } } req = kmalloc(sizeof(struct usb_ctrlrequest), GFP_ATOMIC); if (!req) goto fail_free_buf; req->bRequestType = reqtype; req->bRequest = cmd; req->wValue = cpu_to_le16(value); req->wIndex = cpu_to_le16(index); req->wLength = cpu_to_le16(size); usb_fill_control_urb(urb, dev->udev, usb_sndctrlpipe(dev->udev, 0), (void *)req, buf, size, usbnet_async_cmd_cb, req); urb->transfer_flags |= URB_FREE_BUFFER; err = usb_submit_urb(urb, GFP_ATOMIC); if (err < 0) { netdev_err(dev->net, "Error submitting the control" " message: status=%d\n", err); goto fail_free_all; } return 0; fail_free_all: kfree(req); fail_free_buf: kfree(buf); /* * avoid a double free * needed because the flag can be set only * after filling the URB */ urb->transfer_flags = 0; fail_free_urb: usb_free_urb(urb); fail: return err; } EXPORT_SYMBOL_GPL(usbnet_write_cmd_async); /*-------------------------------------------------------------------------*/ static int __init usbnet_init(void) { /* Compiler should optimize this out. */ BUILD_BUG_ON( sizeof_field(struct sk_buff, cb) < sizeof(struct skb_data)); return 0; } module_init(usbnet_init); static void __exit usbnet_exit(void) { } module_exit(usbnet_exit); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("USB network driver framework"); MODULE_LICENSE("GPL"); |
10 10 10 9 9 1 1 1 2 1 1 10 10 10 8 1 1 1 1 11 16 1 9 2 7 4 4 4 4 6 6 2 4 3 4 4 1 1 4 1 6 1 4 13 13 2 2 9 4 3 3 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 | /* * Copyright (C) 2017 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> #include "netdevsim.h" #define pr_vlog(env, fmt, ...) \ bpf_verifier_log_write(env, "[netdevsim] " fmt, ##__VA_ARGS__) struct nsim_bpf_bound_prog { struct nsim_dev *nsim_dev; struct bpf_prog *prog; struct dentry *ddir; const char *state; bool is_loaded; struct list_head l; }; #define NSIM_BPF_MAX_KEYS 2 struct nsim_bpf_bound_map { struct netdevsim *ns; struct bpf_offloaded_map *map; struct mutex mutex; struct nsim_map_entry { void *key; void *value; } entry[NSIM_BPF_MAX_KEYS]; struct list_head l; }; static int nsim_bpf_string_show(struct seq_file *file, void *data) { const char **str = file->private; if (*str) seq_printf(file, "%s\n", *str); return 0; } DEFINE_SHOW_ATTRIBUTE(nsim_bpf_string); static int nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) { struct nsim_bpf_bound_prog *state; int ret = 0; state = env->prog->aux->offload->dev_priv; if (state->nsim_dev->bpf_bind_verifier_delay && !insn_idx) msleep(state->nsim_dev->bpf_bind_verifier_delay); if (insn_idx == env->prog->len - 1) { pr_vlog(env, "Hello from netdevsim!\n"); if (!state->nsim_dev->bpf_bind_verifier_accept) ret = -EOPNOTSUPP; } return ret; } static int nsim_bpf_finalize(struct bpf_verifier_env *env) { return 0; } static bool nsim_xdp_offload_active(struct netdevsim *ns) { return ns->xdp_hw.prog; } static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded) { struct nsim_bpf_bound_prog *state; if (!prog || !bpf_prog_is_offloaded(prog->aux)) return; state = prog->aux->offload->dev_priv; state->is_loaded = loaded; } static int nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog) { nsim_prog_set_loaded(ns->bpf_offloaded, false); WARN(!!ns->bpf_offloaded != oldprog, "bad offload state, expected offload %sto be active", oldprog ? "" : "not "); ns->bpf_offloaded = prog; ns->bpf_offloaded_id = prog ? prog->aux->id : 0; nsim_prog_set_loaded(prog, true); return 0; } int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct tc_cls_bpf_offload *cls_bpf = type_data; struct bpf_prog *prog = cls_bpf->prog; struct netdevsim *ns = cb_priv; struct bpf_prog *oldprog; if (type != TC_SETUP_CLSBPF) { NSIM_EA(cls_bpf->common.extack, "only offload of BPF classifiers supported"); return -EOPNOTSUPP; } if (!tc_cls_can_offload_and_chain0(ns->netdev, &cls_bpf->common)) return -EOPNOTSUPP; if (cls_bpf->common.protocol != htons(ETH_P_ALL)) { NSIM_EA(cls_bpf->common.extack, "only ETH_P_ALL supported as filter protocol"); return -EOPNOTSUPP; } if (!ns->bpf_tc_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject BPF TC offload"); return -EOPNOTSUPP; } /* Note: progs without skip_sw will probably not be dev bound */ if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject unbound programs"); return -EOPNOTSUPP; } if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; oldprog = cls_bpf->oldprog; /* Don't remove if oldprog doesn't match driver's state */ if (ns->bpf_offloaded != oldprog) { oldprog = NULL; if (!cls_bpf->prog) return 0; if (ns->bpf_offloaded) { NSIM_EA(cls_bpf->common.extack, "driver and netdev offload states mismatch"); return -EBUSY; } } return nsim_bpf_offload(ns, cls_bpf->prog, oldprog); } int nsim_bpf_disable_tc(struct netdevsim *ns) { if (ns->bpf_offloaded && !nsim_xdp_offload_active(ns)) return -EBUSY; return 0; } static int nsim_xdp_offload_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { if (!nsim_xdp_offload_active(ns) && !bpf->prog) return 0; if (!nsim_xdp_offload_active(ns) && bpf->prog && ns->bpf_offloaded) { NSIM_EA(bpf->extack, "TC program is already loaded"); return -EBUSY; } return nsim_bpf_offload(ns, bpf->prog, nsim_xdp_offload_active(ns)); } static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf, struct xdp_attachment_info *xdp) { int err; if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW && !ns->bpf_xdpoffload_accept) { NSIM_EA(bpf->extack, "XDP offload disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW) { err = nsim_xdp_offload_prog(ns, bpf); if (err) return err; } xdp_attachment_setup(xdp, bpf); return 0; } static int nsim_bpf_create_prog(struct nsim_dev *nsim_dev, struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; char name[16]; int ret; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; state->nsim_dev = nsim_dev; state->prog = prog; state->state = "verify"; /* Program id is not populated yet when we create the state. */ sprintf(name, "%u", nsim_dev->prog_id_gen++); state->ddir = debugfs_create_dir(name, nsim_dev->ddir_bpf_bound_progs); if (IS_ERR(state->ddir)) { ret = PTR_ERR(state->ddir); kfree(state); return ret; } debugfs_create_u32("id", 0400, state->ddir, &prog->aux->id); debugfs_create_file("state", 0400, state->ddir, &state->state, &nsim_bpf_string_fops); debugfs_create_bool("loaded", 0400, state->ddir, &state->is_loaded); list_add_tail(&state->l, &nsim_dev->bpf_bound_progs); prog->aux->offload->dev_priv = state; return 0; } static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { struct nsim_dev *nsim_dev = bpf_offload_dev_priv(prog->aux->offload->offdev); if (!nsim_dev->bpf_bind_accept) return -EOPNOTSUPP; return nsim_bpf_create_prog(nsim_dev, prog); } static int nsim_bpf_translate(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; state->state = "xlated"; return 0; } static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; state = prog->aux->offload->dev_priv; WARN(state->is_loaded, "offload state destroyed while program still bound"); debugfs_remove_recursive(state->ddir); list_del(&state->l); kfree(state); } static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, .translate = nsim_bpf_translate, .destroy = nsim_bpf_destroy_prog, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { if (bpf->prog && bpf->prog->aux->offload) { NSIM_EA(bpf->extack, "attempt to load offloaded prog to drv"); return -EINVAL; } if (ns->netdev->mtu > NSIM_XDP_MAX_MTU) { NSIM_EA(bpf->extack, "MTU too large w/ XDP enabled"); return -EINVAL; } return 0; } static int nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { struct nsim_bpf_bound_prog *state; if (!bpf->prog) return 0; if (!bpf_prog_is_offloaded(bpf->prog->aux)) { NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); return -EINVAL; } state = bpf->prog->aux->offload->dev_priv; if (WARN_ON(strcmp(state->state, "xlated"))) { NSIM_EA(bpf->extack, "offloading program in bad state"); return -EINVAL; } return 0; } static bool nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) { return e->key && !memcmp(key, e->key, map->key_size); } static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) return i; return -ENOENT; } static int nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].key) return -ENOMEM; nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].value) { kfree(nmap->entry[idx].key); nmap->entry[idx].key = NULL; return -ENOMEM; } return 0; } static int nsim_map_get_next_key(struct bpf_offloaded_map *offmap, void *key, void *next_key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx = -ENOENT; mutex_lock(&nmap->mutex); if (key) idx = nsim_map_key_find(offmap, key); if (idx == -ENOENT) idx = 0; else idx++; for (; idx < ARRAY_SIZE(nmap->entry); idx++) { if (nmap->entry[idx].key) { memcpy(next_key, nmap->entry[idx].key, offmap->map.key_size); break; } } mutex_unlock(&nmap->mutex); if (idx == ARRAY_SIZE(nmap->entry)) return -ENOENT; return 0; } static int nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) memcpy(value, nmap->entry[idx].value, offmap->map.value_size); mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static int nsim_map_update_elem(struct bpf_offloaded_map *offmap, void *key, void *value, u64 flags) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx, err = 0; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx < 0 && flags == BPF_EXIST) { err = idx; goto exit_unlock; } if (idx >= 0 && flags == BPF_NOEXIST) { err = -EEXIST; goto exit_unlock; } if (idx < 0) { for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) if (!nmap->entry[idx].key) break; if (idx == ARRAY_SIZE(nmap->entry)) { err = -E2BIG; goto exit_unlock; } err = nsim_map_alloc_elem(offmap, idx); if (err) goto exit_unlock; } memcpy(nmap->entry[idx].key, key, offmap->map.key_size); memcpy(nmap->entry[idx].value, value, offmap->map.value_size); exit_unlock: mutex_unlock(&nmap->mutex); return err; } static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) return -EINVAL; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) { kfree(nmap->entry[idx].key); kfree(nmap->entry[idx].value); memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); } mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static const struct bpf_map_dev_ops nsim_bpf_map_ops = { .map_get_next_key = nsim_map_get_next_key, .map_lookup_elem = nsim_map_lookup_elem, .map_update_elem = nsim_map_update_elem, .map_delete_elem = nsim_map_delete_elem, }; static int nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap; int i, err; if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && offmap->map.map_type != BPF_MAP_TYPE_HASH)) return -EINVAL; if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) return -ENOMEM; if (offmap->map.map_flags) return -EINVAL; nmap = kzalloc(sizeof(*nmap), GFP_KERNEL_ACCOUNT); if (!nmap) return -ENOMEM; offmap->dev_priv = nmap; nmap->ns = ns; nmap->map = offmap; mutex_init(&nmap->mutex); if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { u32 *key; err = nsim_map_alloc_elem(offmap, i); if (err) goto err_free; key = nmap->entry[i].key; *key = i; memset(nmap->entry[i].value, 0, offmap->map.value_size); } } offmap->dev_ops = &nsim_bpf_map_ops; list_add_tail(&nmap->l, &ns->nsim_dev->bpf_bound_maps); return 0; err_free: while (--i >= 0) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } kfree(nmap); return err; } static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } list_del_init(&nmap->l); mutex_destroy(&nmap->mutex); kfree(nmap); } int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); int err; ASSERT_RTNL(); switch (bpf->command) { case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp); case XDP_SETUP_PROG_HW: err = nsim_setup_prog_hw_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp_hw); case BPF_OFFLOAD_MAP_ALLOC: if (!ns->bpf_map_accept) return -EOPNOTSUPP; return nsim_bpf_map_alloc(ns, bpf->offmap); case BPF_OFFLOAD_MAP_FREE: nsim_bpf_map_free(bpf->offmap); return 0; default: return -EINVAL; } } int nsim_bpf_dev_init(struct nsim_dev *nsim_dev) { int err; INIT_LIST_HEAD(&nsim_dev->bpf_bound_progs); INIT_LIST_HEAD(&nsim_dev->bpf_bound_maps); nsim_dev->ddir_bpf_bound_progs = debugfs_create_dir("bpf_bound_progs", nsim_dev->ddir); if (IS_ERR(nsim_dev->ddir_bpf_bound_progs)) return PTR_ERR(nsim_dev->ddir_bpf_bound_progs); nsim_dev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops, nsim_dev); err = PTR_ERR_OR_ZERO(nsim_dev->bpf_dev); if (err) return err; nsim_dev->bpf_bind_accept = true; debugfs_create_bool("bpf_bind_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_accept); debugfs_create_u32("bpf_bind_verifier_delay", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_delay); nsim_dev->bpf_bind_verifier_accept = true; debugfs_create_bool("bpf_bind_verifier_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_accept); return 0; } void nsim_bpf_dev_exit(struct nsim_dev *nsim_dev) { WARN_ON(!list_empty(&nsim_dev->bpf_bound_progs)); WARN_ON(!list_empty(&nsim_dev->bpf_bound_maps)); bpf_offload_dev_destroy(nsim_dev->bpf_dev); } int nsim_bpf_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; int err; err = bpf_offload_dev_netdev_register(ns->nsim_dev->bpf_dev, ns->netdev); if (err) return err; debugfs_create_u32("bpf_offloaded_id", 0400, ddir, &ns->bpf_offloaded_id); ns->bpf_tc_accept = true; debugfs_create_bool("bpf_tc_accept", 0600, ddir, &ns->bpf_tc_accept); debugfs_create_bool("bpf_tc_non_bound_accept", 0600, ddir, &ns->bpf_tc_non_bound_accept); ns->bpf_xdpdrv_accept = true; debugfs_create_bool("bpf_xdpdrv_accept", 0600, ddir, &ns->bpf_xdpdrv_accept); ns->bpf_xdpoffload_accept = true; debugfs_create_bool("bpf_xdpoffload_accept", 0600, ddir, &ns->bpf_xdpoffload_accept); ns->bpf_map_accept = true; debugfs_create_bool("bpf_map_accept", 0600, ddir, &ns->bpf_map_accept); return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); bpf_offload_dev_netdev_unregister(ns->nsim_dev->bpf_dev, ns->netdev); } |
154 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | /* * Copyright (C) 2014 Red Hat * Copyright (C) 2014 Intel Corp. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rob Clark <robdclark@gmail.com> * Daniel Vetter <daniel.vetter@ffwll.ch> */ #ifndef DRM_ATOMIC_HELPER_H_ #define DRM_ATOMIC_HELPER_H_ #include <drm/drm_crtc.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_modeset_helper.h> #include <drm/drm_atomic_state_helper.h> #include <drm/drm_util.h> /* * Drivers that don't allow primary plane scaling may pass this macro in place * of the min/max scale parameters of the plane-state checker function. * * Due to src being in 16.16 fixed point and dest being in integer pixels, * 1<<16 represents no scaling. */ #define DRM_PLANE_NO_SCALING (1<<16) struct drm_atomic_state; struct drm_private_obj; struct drm_private_state; int drm_atomic_helper_check_modeset(struct drm_device *dev, struct drm_atomic_state *state); int drm_atomic_helper_check_wb_connector_state(struct drm_connector *connector, struct drm_atomic_state *state); int drm_atomic_helper_check_plane_state(struct drm_plane_state *plane_state, const struct drm_crtc_state *crtc_state, int min_scale, int max_scale, bool can_position, bool can_update_disabled); int drm_atomic_helper_check_planes(struct drm_device *dev, struct drm_atomic_state *state); int drm_atomic_helper_check_crtc_primary_plane(struct drm_crtc_state *crtc_state); int drm_atomic_helper_check(struct drm_device *dev, struct drm_atomic_state *state); void drm_atomic_helper_commit_tail(struct drm_atomic_state *state); void drm_atomic_helper_commit_tail_rpm(struct drm_atomic_state *state); int drm_atomic_helper_commit(struct drm_device *dev, struct drm_atomic_state *state, bool nonblock); int drm_atomic_helper_async_check(struct drm_device *dev, struct drm_atomic_state *state); void drm_atomic_helper_async_commit(struct drm_device *dev, struct drm_atomic_state *state); int drm_atomic_helper_wait_for_fences(struct drm_device *dev, struct drm_atomic_state *state, bool pre_swap); void drm_atomic_helper_wait_for_vblanks(struct drm_device *dev, struct drm_atomic_state *old_state); void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev, struct drm_atomic_state *old_state); void drm_atomic_helper_update_legacy_modeset_state(struct drm_device *dev, struct drm_atomic_state *old_state); void drm_atomic_helper_calc_timestamping_constants(struct drm_atomic_state *state); void drm_atomic_helper_commit_modeset_disables(struct drm_device *dev, struct drm_atomic_state *state); void drm_atomic_helper_commit_modeset_enables(struct drm_device *dev, struct drm_atomic_state *old_state); int drm_atomic_helper_prepare_planes(struct drm_device *dev, struct drm_atomic_state *state); void drm_atomic_helper_unprepare_planes(struct drm_device *dev, struct drm_atomic_state *state); #define DRM_PLANE_COMMIT_ACTIVE_ONLY BIT(0) #define DRM_PLANE_COMMIT_NO_DISABLE_AFTER_MODESET BIT(1) void drm_atomic_helper_commit_planes(struct drm_device *dev, struct drm_atomic_state *state, uint32_t flags); void drm_atomic_helper_cleanup_planes(struct drm_device *dev, struct drm_atomic_state *old_state); void drm_atomic_helper_commit_planes_on_crtc(struct drm_crtc_state *old_crtc_state); void drm_atomic_helper_disable_planes_on_crtc(struct drm_crtc_state *old_crtc_state, bool atomic); int __must_check drm_atomic_helper_swap_state(struct drm_atomic_state *state, bool stall); /* nonblocking commit helpers */ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, bool nonblock); void drm_atomic_helper_wait_for_dependencies(struct drm_atomic_state *state); void drm_atomic_helper_fake_vblank(struct drm_atomic_state *state); void drm_atomic_helper_commit_hw_done(struct drm_atomic_state *state); void drm_atomic_helper_commit_cleanup_done(struct drm_atomic_state *state); /* implementations for legacy interfaces */ int drm_atomic_helper_update_plane(struct drm_plane *plane, struct drm_crtc *crtc, struct drm_framebuffer *fb, int crtc_x, int crtc_y, unsigned int crtc_w, unsigned int crtc_h, uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, struct drm_modeset_acquire_ctx *ctx); int drm_atomic_helper_disable_plane(struct drm_plane *plane, struct drm_modeset_acquire_ctx *ctx); int drm_atomic_helper_set_config(struct drm_mode_set *set, struct drm_modeset_acquire_ctx *ctx); int drm_atomic_helper_disable_all(struct drm_device *dev, struct drm_modeset_acquire_ctx *ctx); void drm_atomic_helper_shutdown(struct drm_device *dev); struct drm_atomic_state * drm_atomic_helper_duplicate_state(struct drm_device *dev, struct drm_modeset_acquire_ctx *ctx); struct drm_atomic_state *drm_atomic_helper_suspend(struct drm_device *dev); int drm_atomic_helper_commit_duplicated_state(struct drm_atomic_state *state, struct drm_modeset_acquire_ctx *ctx); int drm_atomic_helper_resume(struct drm_device *dev, struct drm_atomic_state *state); int drm_atomic_helper_page_flip(struct drm_crtc *crtc, struct drm_framebuffer *fb, struct drm_pending_vblank_event *event, uint32_t flags, struct drm_modeset_acquire_ctx *ctx); int drm_atomic_helper_page_flip_target( struct drm_crtc *crtc, struct drm_framebuffer *fb, struct drm_pending_vblank_event *event, uint32_t flags, uint32_t target, struct drm_modeset_acquire_ctx *ctx); /** * drm_atomic_crtc_for_each_plane - iterate over planes currently attached to CRTC * @plane: the loop cursor * @crtc: the CRTC whose planes are iterated * * This iterates over the current state, useful (for example) when applying * atomic state after it has been checked and swapped. To iterate over the * planes which *will* be attached (more useful in code called from * &drm_mode_config_funcs.atomic_check) see * drm_atomic_crtc_state_for_each_plane(). */ #define drm_atomic_crtc_for_each_plane(plane, crtc) \ drm_for_each_plane_mask(plane, (crtc)->dev, (crtc)->state->plane_mask) /** * drm_atomic_crtc_state_for_each_plane - iterate over attached planes in new state * @plane: the loop cursor * @crtc_state: the incoming CRTC state * * Similar to drm_crtc_for_each_plane(), but iterates the planes that will be * attached if the specified state is applied. Useful during for example * in code called from &drm_mode_config_funcs.atomic_check operations, to * validate the incoming state. */ #define drm_atomic_crtc_state_for_each_plane(plane, crtc_state) \ drm_for_each_plane_mask(plane, (crtc_state)->state->dev, (crtc_state)->plane_mask) /** * drm_atomic_crtc_state_for_each_plane_state - iterate over attached planes in new state * @plane: the loop cursor * @plane_state: loop cursor for the plane's state, must be const * @crtc_state: the incoming CRTC state * * Similar to drm_crtc_for_each_plane(), but iterates the planes that will be * attached if the specified state is applied. Useful during for example * in code called from &drm_mode_config_funcs.atomic_check operations, to * validate the incoming state. * * Compared to just drm_atomic_crtc_state_for_each_plane() this also fills in a * const plane_state. This is useful when a driver just wants to peek at other * active planes on this CRTC, but does not need to change it. */ #define drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, crtc_state) \ drm_for_each_plane_mask(plane, (crtc_state)->state->dev, (crtc_state)->plane_mask) \ for_each_if ((plane_state = \ __drm_atomic_get_current_plane_state((crtc_state)->state, \ plane))) /** * drm_atomic_plane_enabling - check whether a plane is being enabled * @old_plane_state: old atomic plane state * @new_plane_state: new atomic plane state * * Checks the atomic state of a plane to determine whether it's being enabled * or not. This also WARNs if it detects an invalid state (both CRTC and FB * need to either both be NULL or both be non-NULL). * * RETURNS: * True if the plane is being enabled, false otherwise. */ static inline bool drm_atomic_plane_enabling(struct drm_plane_state *old_plane_state, struct drm_plane_state *new_plane_state) { /* * When enabling a plane, CRTC and FB should always be set together. * Anything else should be considered a bug in the atomic core, so we * gently warn about it. */ WARN_ON((!new_plane_state->crtc && new_plane_state->fb) || (new_plane_state->crtc && !new_plane_state->fb)); return !old_plane_state->crtc && new_plane_state->crtc; } /** * drm_atomic_plane_disabling - check whether a plane is being disabled * @old_plane_state: old atomic plane state * @new_plane_state: new atomic plane state * * Checks the atomic state of a plane to determine whether it's being disabled * or not. This also WARNs if it detects an invalid state (both CRTC and FB * need to either both be NULL or both be non-NULL). * * RETURNS: * True if the plane is being disabled, false otherwise. */ static inline bool drm_atomic_plane_disabling(struct drm_plane_state *old_plane_state, struct drm_plane_state *new_plane_state) { /* * When disabling a plane, CRTC and FB should always be NULL together. * Anything else should be considered a bug in the atomic core, so we * gently warn about it. */ WARN_ON((new_plane_state->crtc == NULL && new_plane_state->fb != NULL) || (new_plane_state->crtc != NULL && new_plane_state->fb == NULL)); return old_plane_state->crtc && !new_plane_state->crtc; } u32 * drm_atomic_helper_bridge_propagate_bus_fmt(struct drm_bridge *bridge, struct drm_bridge_state *bridge_state, struct drm_crtc_state *crtc_state, struct drm_connector_state *conn_state, u32 output_fmt, unsigned int *num_input_fmts); #endif /* DRM_ATOMIC_HELPER_H_ */ |
6 6 6 4 4 4 4 4 4 1 1 1 4 4 6 4 4 4 4 3 3 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 | // SPDX-License-Identifier: GPL-2.0 /* * Implementation of operations over local quota file */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/quota.h> #include <linux/quotaops.h> #include <linux/module.h> #include <cluster/masklog.h> #include "ocfs2_fs.h" #include "ocfs2.h" #include "inode.h" #include "alloc.h" #include "file.h" #include "buffer_head_io.h" #include "journal.h" #include "sysfile.h" #include "dlmglue.h" #include "quota.h" #include "uptodate.h" #include "super.h" #include "ocfs2_trace.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) { return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / sizeof(struct ocfs2_local_disk_dqblk)); } /* Number of blocks with entries in one chunk */ static inline unsigned int ol_chunk_blocks(struct super_block *sb) { return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE) << 3) / ol_quota_entries_per_block(sb); } /* Number of entries in a chunk bitmap */ static unsigned int ol_chunk_entries(struct super_block *sb) { return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); } /* Offset of the chunk in quota file */ static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) { /* 1 block for local quota file info, 1 block per chunk for chunk info */ return 1 + (ol_chunk_blocks(sb) + 1) * c; } static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) { int epb = ol_quota_entries_per_block(sb); return ol_quota_chunk_block(sb, c) + 1 + off / epb; } static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) { int epb = ol_quota_entries_per_block(sb); return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); } /* Offset of the dquot structure in the quota file */ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) { return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + ol_dqblk_block_off(sb, c, off); } static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) { return off & ((1 << sb->s_blocksize_bits) - 1); } /* Compute offset in the chunk of a structure with the given offset */ static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) { int epb = ol_quota_entries_per_block(sb); return ((off >> sb->s_blocksize_bits) - ol_quota_chunk_block(sb, c) - 1) * epb + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / sizeof(struct ocfs2_local_disk_dqblk); } /* Write bufferhead into the fs */ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, void (*modify)(struct buffer_head *, void *), void *private) { struct super_block *sb = inode->i_sb; handle_t *handle; int status; handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); return status; } status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); ocfs2_commit_trans(OCFS2_SB(sb), handle); return status; } lock_buffer(bh); modify(bh, private); unlock_buffer(bh); ocfs2_journal_dirty(handle, bh); status = ocfs2_commit_trans(OCFS2_SB(sb), handle); if (status < 0) { mlog_errno(status); return status; } return 0; } /* * Read quota block from a given logical offset. * * This function acquires ip_alloc_sem and thus it must not be called with a * transaction started. */ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, struct buffer_head **bh) { int rc = 0; struct buffer_head *tmp = *bh; if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) return ocfs2_error(inode->i_sb, "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) mlog_errno(rc); /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ if (!rc && !*bh) *bh = tmp; return rc; } /* Check whether we understand format of quota files */ static int ocfs2_local_check_quota_file(struct super_block *sb, int type) { unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE }; struct buffer_head *bh = NULL; struct inode *linode = sb_dqopt(sb)->files[type]; struct inode *ginode = NULL; struct ocfs2_disk_dqheader *dqhead; int status, ret = 0; /* First check whether we understand local quota file */ status = ocfs2_read_quota_block(linode, 0, &bh); if (status) { mlog_errno(status); mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", type); goto out_err; } dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { mlog(ML_ERROR, "quota file magic does not match (%u != %u)," " type=%d\n", le32_to_cpu(dqhead->dqh_magic), lmagics[type], type); goto out_err; } if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { mlog(ML_ERROR, "quota file version does not match (%u != %u)," " type=%d\n", le32_to_cpu(dqhead->dqh_version), lversions[type], type); goto out_err; } brelse(bh); bh = NULL; /* Next check whether we understand global quota file */ ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], OCFS2_INVALID_SLOT); if (!ginode) { mlog(ML_ERROR, "cannot get global quota file inode " "(type=%d)\n", type); goto out_err; } /* Since the header is read only, we don't care about locking */ status = ocfs2_read_quota_block(ginode, 0, &bh); if (status) { mlog_errno(status); mlog(ML_ERROR, "failed to read global quota file header " "(type=%d)\n", type); goto out_err; } dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { mlog(ML_ERROR, "global quota file magic does not match " "(%u != %u), type=%d\n", le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); goto out_err; } if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { mlog(ML_ERROR, "global quota file version does not match " "(%u != %u), type=%d\n", le32_to_cpu(dqhead->dqh_version), gversions[type], type); goto out_err; } ret = 1; out_err: brelse(bh); iput(ginode); return ret; } /* Release given list of quota file chunks */ static void ocfs2_release_local_quota_bitmaps(struct list_head *head) { struct ocfs2_quota_chunk *pos, *next; list_for_each_entry_safe(pos, next, head, qc_chunk) { list_del(&pos->qc_chunk); brelse(pos->qc_headerbh); kmem_cache_free(ocfs2_qf_chunk_cachep, pos); } } /* Load quota bitmaps into memory */ static int ocfs2_load_local_quota_bitmaps(struct inode *inode, struct ocfs2_local_disk_dqinfo *ldinfo, struct list_head *head) { struct ocfs2_quota_chunk *newchunk; int i, status; INIT_LIST_HEAD(head); for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); if (!newchunk) { ocfs2_release_local_quota_bitmaps(head); return -ENOMEM; } newchunk->qc_num = i; newchunk->qc_headerbh = NULL; status = ocfs2_read_quota_block(inode, ol_quota_chunk_block(inode->i_sb, i), &newchunk->qc_headerbh); if (status) { mlog_errno(status); kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); ocfs2_release_local_quota_bitmaps(head); return status; } list_add_tail(&newchunk->qc_chunk, head); } return 0; } static void olq_update_info(struct buffer_head *bh, void *private) { struct mem_dqinfo *info = private; struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; struct ocfs2_local_disk_dqinfo *ldinfo; ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); spin_lock(&dq_data_lock); ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags); ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); spin_unlock(&dq_data_lock); } static int ocfs2_add_recovery_chunk(struct super_block *sb, struct ocfs2_local_disk_chunk *dchunk, int chunk, struct list_head *head) { struct ocfs2_recovery_chunk *rc; rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); if (!rc) return -ENOMEM; rc->rc_chunk = chunk; rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!rc->rc_bitmap) { kfree(rc); return -ENOMEM; } memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, (ol_chunk_entries(sb) + 7) >> 3); list_add_tail(&rc->rc_list, head); return 0; } static void free_recovery_list(struct list_head *head) { struct ocfs2_recovery_chunk *next; struct ocfs2_recovery_chunk *rchunk; list_for_each_entry_safe(rchunk, next, head, rc_list) { list_del(&rchunk->rc_list); kfree(rchunk->rc_bitmap); kfree(rchunk); } } void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) { int type; for (type = 0; type < OCFS2_MAXQUOTAS; type++) free_recovery_list(&(rec->r_list[type])); kfree(rec); } /* Load entries in our quota file we have to recover*/ static int ocfs2_recovery_load_quota(struct inode *lqinode, struct ocfs2_local_disk_dqinfo *ldinfo, int type, struct list_head *head) { struct super_block *sb = lqinode->i_sb; struct buffer_head *hbh; struct ocfs2_local_disk_chunk *dchunk; int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); int status = 0; for (i = 0; i < chunks; i++) { hbh = NULL; status = ocfs2_read_quota_block(lqinode, ol_quota_chunk_block(sb, i), &hbh); if (status) { mlog_errno(status); break; } dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); brelse(hbh); if (status < 0) break; } if (status < 0) free_recovery_list(head); return status; } static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) { int type; struct ocfs2_quota_recovery *rec; rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); if (!rec) return NULL; for (type = 0; type < OCFS2_MAXQUOTAS; type++) INIT_LIST_HEAD(&(rec->r_list[type])); return rec; } /* Load information we need for quota recovery into memory */ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num) { unsigned int feature[OCFS2_MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct inode *lqinode; struct buffer_head *bh; int type; int status = 0; struct ocfs2_quota_recovery *rec; printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); rec = ocfs2_alloc_quota_recovery(); if (!rec) return ERR_PTR(-ENOMEM); /* First init... */ for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; /* At this point, journal of the slot is already replayed so * we can trust metadata and data of the quota file */ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); if (!lqinode) { status = -ENOENT; goto out; } status = ocfs2_inode_lock_full(lqinode, NULL, 1, OCFS2_META_LOCK_RECOVERY); if (status < 0) { mlog_errno(status); goto out_put; } /* Now read local header */ bh = NULL; status = ocfs2_read_quota_block(lqinode, 0, &bh); if (status) { mlog_errno(status); mlog(ML_ERROR, "failed to read quota file info header " "(slot=%d type=%d)\n", slot_num, type); goto out_lock; } ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, &rec->r_list[type]); brelse(bh); out_lock: ocfs2_inode_unlock(lqinode, 1); out_put: iput(lqinode); if (status < 0) break; } out: if (status < 0) { ocfs2_free_quota_recovery(rec); rec = ERR_PTR(status); } return rec; } /* Sync changes in local quota file into global quota file and * reinitialize local quota file. * The function expects local quota file to be already locked and * s_umount locked in shared mode. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) { struct super_block *sb = lqinode->i_sb; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; struct ocfs2_local_disk_chunk *dchunk; struct ocfs2_local_disk_dqblk *dqblk; struct dquot *dquot; handle_t *handle; struct buffer_head *hbh = NULL, *qbh = NULL; int status = 0; int bit, chunk; struct ocfs2_recovery_chunk *rchunk, *next; qsize_t spacechange, inodechange; unsigned int memalloc; trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { chunk = rchunk->rc_chunk; hbh = NULL; status = ocfs2_read_quota_block(lqinode, ol_quota_chunk_block(sb, chunk), &hbh); if (status) { mlog_errno(status); break; } dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { qbh = NULL; status = ocfs2_read_quota_block(lqinode, ol_dqblk_block(sb, chunk, bit), &qbh); if (status) { mlog_errno(status); break; } dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + ol_dqblk_block_off(sb, chunk, bit)); dquot = dqget(sb, make_kqid(&init_user_ns, type, le64_to_cpu(dqblk->dqb_id))); if (IS_ERR(dquot)) { status = PTR_ERR(dquot); mlog(ML_ERROR, "Failed to get quota structure " "for id %u, type %d. Cannot finish quota " "file recovery.\n", (unsigned)le64_to_cpu(dqblk->dqb_id), type); goto out_put_bh; } status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) { mlog_errno(status); goto out_put_dquot; } handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QSYNC_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out_drop_lock; } down_write(&sb_dqopt(sb)->dqio_sem); memalloc = memalloc_nofs_save(); spin_lock(&dquot->dq_dqb_lock); /* Add usage from quota entry into quota changes * of our node. Auxiliary variables are important * due to signedness */ spacechange = le64_to_cpu(dqblk->dqb_spacemod); inodechange = le64_to_cpu(dqblk->dqb_inodemod); dquot->dq_dqb.dqb_curspace += spacechange; dquot->dq_dqb.dqb_curinodes += inodechange; spin_unlock(&dquot->dq_dqb_lock); /* We want to drop reference held by the crashed * node. Since we have our own reference we know * global structure actually won't be freed. */ status = ocfs2_global_release_dquot(dquot); if (status < 0) { mlog_errno(status); goto out_commit; } /* Release local quota file entry */ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), qbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; } lock_buffer(qbh); WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); out_commit: memalloc_nofs_restore(memalloc); up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(OCFS2_SB(sb), handle); out_drop_lock: ocfs2_unlock_global_qf(oinfo, 1); out_put_dquot: dqput(dquot); out_put_bh: brelse(qbh); if (status < 0) break; } brelse(hbh); list_del(&rchunk->rc_list); kfree(rchunk->rc_bitmap); kfree(rchunk); if (status < 0) break; } if (status < 0) free_recovery_list(&(rec->r_list[type])); if (status) mlog_errno(status); return status; } /* Recover local quota files for given node different from us */ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, struct ocfs2_quota_recovery *rec, int slot_num) { unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; int type; int status = 0; struct inode *lqinode; unsigned int flags; printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; trace_ocfs2_finish_quota_recovery(slot_num); lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); if (!lqinode) { status = -ENOENT; goto out; } status = ocfs2_inode_lock_full(lqinode, NULL, 1, OCFS2_META_LOCK_NOQUEUE); /* Someone else is holding the lock? Then he must be * doing the recovery. Just skip the file... */ if (status == -EAGAIN) { printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " "device (%s) for slot %d because quota file is " "locked.\n", osb->dev_str, slot_num); status = 0; goto out_put; } else if (status < 0) { mlog_errno(status); goto out_put; } /* Now read local header */ bh = NULL; status = ocfs2_read_quota_block(lqinode, 0, &bh); if (status) { mlog_errno(status); mlog(ML_ERROR, "failed to read quota file info header " "(slot=%d type=%d)\n", slot_num, type); goto out_lock; } ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); /* Is recovery still needed? */ flags = le32_to_cpu(ldinfo->dqi_flags); if (!(flags & OLQF_CLEAN)) status = ocfs2_recover_local_quota_file(lqinode, type, rec); /* We don't want to mark file as clean when it is actually * active */ if (slot_num == osb->slot_num) goto out_bh; /* Mark quota file as clean if we are recovering quota file of * some other node. */ handle = ocfs2_start_trans(osb, OCFS2_LOCAL_QINFO_WRITE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out_bh; } status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_trans; } lock_buffer(bh); ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); unlock_buffer(bh); ocfs2_journal_dirty(handle, bh); out_trans: ocfs2_commit_trans(osb, handle); out_bh: brelse(bh); out_lock: ocfs2_inode_unlock(lqinode, 1); out_put: iput(lqinode); if (status < 0) break; } out: up_read(&sb->s_umount); kfree(rec); return status; } /* Read information header from quota file */ static int ocfs2_local_read_info(struct super_block *sb, int type) { struct ocfs2_local_disk_dqinfo *ldinfo; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo; struct inode *lqinode = sb_dqopt(sb)->files[type]; int status; struct buffer_head *bh = NULL; struct ocfs2_quota_recovery *rec; int locked = 0, global_read = 0; info->dqi_max_spc_limit = 0x7fffffffffffffffLL; info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" " info."); status = -ENOMEM; goto out_err; } info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; status = ocfs2_global_read_info(sb, type); if (status < 0) goto out_err; global_read = 1; status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); if (status < 0) { mlog_errno(status); goto out_err; } locked = 1; /* Now read local header */ status = ocfs2_read_quota_block(lqinode, 0, &bh); if (status) { mlog_errno(status); mlog(ML_ERROR, "failed to read quota file info header " "(type=%d)\n", type); goto out_err; } ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); oinfo->dqi_libh = bh; /* We crashed when using local quota file? */ if (!(oinfo->dqi_flags & OLQF_CLEAN)) { rec = OCFS2_SB(sb)->quota_rec; if (!rec) { rec = ocfs2_alloc_quota_recovery(); if (!rec) { status = -ENOMEM; mlog_errno(status); goto out_err; } OCFS2_SB(sb)->quota_rec = rec; } status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, &rec->r_list[type]); if (status < 0) { mlog_errno(status); goto out_err; } } status = ocfs2_load_local_quota_bitmaps(lqinode, ldinfo, &oinfo->dqi_chunk); if (status < 0) { mlog_errno(status); goto out_err; } /* Now mark quota file as used */ oinfo->dqi_flags &= ~OLQF_CLEAN; status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); if (status < 0) { mlog_errno(status); goto out_err; } return 0; out_err: if (oinfo) { iput(oinfo->dqi_gqinode); ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); ocfs2_lock_res_free(&oinfo->dqi_gqlock); brelse(oinfo->dqi_lqi_bh); if (locked) ocfs2_inode_unlock(lqinode, 1); ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); if (global_read) cancel_delayed_work_sync(&oinfo->dqi_sync_work); kfree(oinfo); } brelse(bh); return status; } /* Write local info to quota file */ static int ocfs2_local_write_info(struct super_block *sb, int type) { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) ->dqi_libh; int status; status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, info); if (status < 0) { mlog_errno(status); return -1; } return 0; } /* Release info from memory */ static int ocfs2_local_free_info(struct super_block *sb, int type) { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; struct ocfs2_quota_chunk *chunk; struct ocfs2_local_disk_chunk *dchunk; int mark_clean = 1, len; int status = 0; iput(oinfo->dqi_gqinode); ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); ocfs2_lock_res_free(&oinfo->dqi_gqlock); list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) (chunk->qc_headerbh->b_data); if (chunk->qc_num < oinfo->dqi_chunks - 1) { len = ol_chunk_entries(sb); } else { len = (oinfo->dqi_blocks - ol_quota_chunk_block(sb, chunk->qc_num) - 1) * ol_quota_entries_per_block(sb); } /* Not all entries free? Bug! */ if (le32_to_cpu(dchunk->dqc_free) != len) { mlog(ML_ERROR, "releasing quota file with used " "entries (type=%d)\n", type); mark_clean = 0; } } ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); /* * s_umount held in exclusive mode protects us against racing with * recovery thread... */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); mark_clean = 0; } if (!mark_clean) goto out; /* Mark local file as clean */ oinfo->dqi_flags |= OLQF_CLEAN; status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], oinfo->dqi_libh, olq_update_info, info); if (status < 0) mlog_errno(status); out: ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); return status; } static void olq_set_dquot(struct buffer_head *bh, void *private) { struct ocfs2_dquot *od = private; struct ocfs2_local_disk_dqblk *dqblk; struct super_block *sb = od->dq_dquot.dq_sb; dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + ol_dqblk_block_offset(sb, od->dq_local_off)); dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, od->dq_dquot.dq_id)); spin_lock(&od->dq_dquot.dq_dqb_lock); dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - od->dq_origspace); dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - od->dq_originodes); spin_unlock(&od->dq_dquot.dq_dqb_lock); trace_olq_set_dquot( (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), from_kqid(&init_user_ns, od->dq_dquot.dq_id)); } /* Write dquot to local quota file */ int ocfs2_local_write_dquot(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); struct buffer_head *bh; struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_id.type]; int status; status = ocfs2_read_quota_phys_block(lqinode, od->dq_local_phys_blk, &bh); if (status) { mlog_errno(status); goto out; } status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od); if (status < 0) { mlog_errno(status); goto out; } out: brelse(bh); return status; } /* Find free entry in local quota file */ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, int type, int *offset) { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; struct ocfs2_quota_chunk *chunk = NULL, *iter; struct ocfs2_local_disk_chunk *dchunk; int found = 0, len; list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) iter->qc_headerbh->b_data; if (le32_to_cpu(dchunk->dqc_free) > 0) { chunk = iter; break; } } if (!chunk) return NULL; if (chunk->qc_num < oinfo->dqi_chunks - 1) { len = ol_chunk_entries(sb); } else { len = (oinfo->dqi_blocks - ol_quota_chunk_block(sb, chunk->qc_num) - 1) * ol_quota_entries_per_block(sb); } found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); /* We failed? */ if (found == len) { mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" " entries free (type=%d)\n", chunk->qc_num, le32_to_cpu(dchunk->dqc_free), type); return ERR_PTR(-EIO); } *offset = found; return chunk; } /* Add new chunk to the local quota file */ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( struct super_block *sb, int type, int *offset) { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; struct inode *lqinode = sb_dqopt(sb)->files[type]; struct ocfs2_quota_chunk *chunk = NULL; struct ocfs2_local_disk_chunk *dchunk; int status; handle_t *handle; struct buffer_head *bh = NULL, *dbh = NULL; u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, i_size_read(lqinode) + 2 * sb->s_blocksize, i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, i_size_read(lqinode) + 2 * sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; } chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); if (!chunk) { status = -ENOMEM; mlog_errno(status); goto out; } /* Local quota info and two new blocks we initialize */ handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_LOCAL_QINFO_WRITE_CREDITS + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } /* Initialize chunk header */ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto out_trans; } bh = sb_getblk(sb, p_blkno); if (!bh) { status = -ENOMEM; mlog_errno(status); goto out_trans; } dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; } lock_buffer(bh); dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); memset(dchunk->dqc_bitmap, 0, sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); unlock_buffer(bh); ocfs2_journal_dirty(handle, bh); /* Initialize new block with structures */ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, &p_blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto out_trans; } dbh = sb_getblk(sb, p_blkno); if (!dbh) { status = -ENOMEM; mlog_errno(status); goto out_trans; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh); status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; } lock_buffer(dbh); memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); unlock_buffer(dbh); ocfs2_journal_dirty(handle, dbh); /* Update local quotafile info */ oinfo->dqi_blocks += 2; oinfo->dqi_chunks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { mlog_errno(status); goto out_trans; } status = ocfs2_commit_trans(OCFS2_SB(sb), handle); if (status < 0) { mlog_errno(status); goto out; } list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); chunk->qc_num = list_entry(chunk->qc_chunk.prev, struct ocfs2_quota_chunk, qc_chunk)->qc_num + 1; chunk->qc_headerbh = bh; *offset = 0; return chunk; out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: brelse(bh); brelse(dbh); kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); return ERR_PTR(status); } /* Find free entry in local quota file */ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( struct super_block *sb, int type, int *offset) { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; struct ocfs2_quota_chunk *chunk; struct inode *lqinode = sb_dqopt(sb)->files[type]; struct ocfs2_local_disk_chunk *dchunk; int epb = ol_quota_entries_per_block(sb); unsigned int chunk_blocks; struct buffer_head *bh; u64 p_blkno; int status; handle_t *handle; if (list_empty(&oinfo->dqi_chunk)) return ocfs2_local_quota_add_chunk(sb, type, offset); /* Is the last chunk full? */ chunk = list_entry(oinfo->dqi_chunk.prev, struct ocfs2_quota_chunk, qc_chunk); chunk_blocks = oinfo->dqi_blocks - ol_quota_chunk_block(sb, chunk->qc_num) - 1; if (ol_chunk_blocks(sb) == chunk_blocks) return ocfs2_local_quota_add_chunk(sb, type, offset); /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, i_size_read(lqinode) + sb->s_blocksize, i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, i_size_read(lqinode) + sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; } /* Get buffer from the just added block */ status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto out; } bh = sb_getblk(sb, p_blkno); if (!bh) { status = -ENOMEM; mlog_errno(status); goto out; } ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); /* Local quota info, chunk header and the new block we initialize */ handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_LOCAL_QINFO_WRITE_CREDITS + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } /* Zero created block */ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; } lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); unlock_buffer(bh); ocfs2_journal_dirty(handle, bh); /* Update chunk header */ status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_trans; } dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; lock_buffer(chunk->qc_headerbh); le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); unlock_buffer(chunk->qc_headerbh); ocfs2_journal_dirty(handle, chunk->qc_headerbh); /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { mlog_errno(status); goto out_trans; } status = ocfs2_commit_trans(OCFS2_SB(sb), handle); if (status < 0) { mlog_errno(status); goto out; } *offset = chunk_blocks * epb; return chunk; out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: return ERR_PTR(status); } static void olq_alloc_dquot(struct buffer_head *bh, void *private) { int *offset = private; struct ocfs2_local_disk_chunk *dchunk; dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, -1); } /* Create dquot in the local file for given id */ int ocfs2_create_local_dquot(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; int type = dquot->dq_id.type; struct inode *lqinode = sb_dqopt(sb)->files[type]; struct ocfs2_quota_chunk *chunk; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); int offset; int status; u64 pcount; down_write(&OCFS2_I(lqinode)->ip_alloc_sem); chunk = ocfs2_find_free_entry(sb, type, &offset); if (!chunk) { chunk = ocfs2_extend_local_quota_file(sb, type, &offset); if (IS_ERR(chunk)) { status = PTR_ERR(chunk); goto out; } } else if (IS_ERR(chunk)) { status = PTR_ERR(chunk); goto out; } od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); od->dq_chunk = chunk; status = ocfs2_extent_map_get_blocks(lqinode, ol_dqblk_block(sb, chunk->qc_num, offset), &od->dq_local_phys_blk, &pcount, NULL); if (status < 0) { mlog_errno(status); goto out; } /* Initialize dquot structure on disk */ status = ocfs2_local_write_dquot(dquot); if (status < 0) { mlog_errno(status); goto out; } /* Mark structure as allocated */ status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, &offset); if (status < 0) { mlog_errno(status); goto out; } out: up_write(&OCFS2_I(lqinode)->ip_alloc_sem); return status; } /* * Release dquot structure from local quota file. ocfs2_release_dquot() has * already started a transaction and written all changes to global quota file */ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) { int status; int type = dquot->dq_id.type; struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); struct super_block *sb = dquot->dq_sb; struct ocfs2_local_disk_chunk *dchunk; int offset; status = ocfs2_journal_access_dq(handle, INODE_CACHE(sb_dqopt(sb)->files[type]), od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out; } offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, od->dq_local_off); dchunk = (struct ocfs2_local_disk_chunk *) (od->dq_chunk->qc_headerbh->b_data); /* Mark structure as freed */ lock_buffer(od->dq_chunk->qc_headerbh); ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(od->dq_chunk->qc_headerbh); ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); out: return status; } static const struct quota_format_ops ocfs2_format_ops = { .check_quota_file = ocfs2_local_check_quota_file, .read_file_info = ocfs2_local_read_info, .write_file_info = ocfs2_global_write_info, .free_file_info = ocfs2_local_free_info, }; struct quota_format_type ocfs2_quota_format = { .qf_fmt_id = QFMT_OCFS2, .qf_ops = &ocfs2_format_ops, .qf_owner = THIS_MODULE }; |
2 30 41 38 9 1 9 9 2 2 2 15 10 31 31 2 2 46 41 30 4 46 46 38 15 15 13 10 10 26 26 26 82 65 17 35 36 16 1 1 45 10 36 72 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 | // SPDX-License-Identifier: GPL-2.0 #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/list.h> #include <linux/math64.h> #include <linux/sizes.h> #include <linux/workqueue.h> #include "ctree.h" #include "block-group.h" #include "discard.h" #include "free-space-cache.h" #include "fs.h" /* * This contains the logic to handle async discard. * * Async discard manages trimming of free space outside of transaction commit. * Discarding is done by managing the block_groups on a LRU list based on free * space recency. Two passes are used to first prioritize discarding extents * and then allow for trimming in the bitmap the best opportunity to coalesce. * The block_groups are maintained on multiple lists to allow for multiple * passes with different discard filter requirements. A delayed work item is * used to manage discarding with timeout determined by a max of the delay * incurred by the iops rate limit, the byte rate limit, and the max delay of * BTRFS_DISCARD_MAX_DELAY. * * Note, this only keeps track of block_groups that are explicitly for data. * Mixed block_groups are not supported. * * The first list is special to manage discarding of fully free block groups. * This is necessary because we issue a final trim for a full free block group * after forgetting it. When a block group becomes unused, instead of directly * being added to the unused_bgs list, we add it to this first list. Then * from there, if it becomes fully discarded, we place it onto the unused_bgs * list. * * The in-memory free space cache serves as the backing state for discard. * Consequently this means there is no persistence. We opt to load all the * block groups in as not discarded, so the mount case degenerates to the * crashing case. * * As the free space cache uses bitmaps, there exists a tradeoff between * ease/efficiency for find_free_extent() and the accuracy of discard state. * Here we opt to let untrimmed regions merge with everything while only letting * trimmed regions merge with other trimmed regions. This can cause * overtrimming, but the coalescing benefit seems to be worth it. Additionally, * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, * this resets the state and we will retry trimming the whole bitmap. This is a * tradeoff between discard state accuracy and the cost of accounting. */ /* This is an initial delay to give some chance for block reuse */ #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (1000U) /* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, BTRFS_ASYNC_DISCARD_MIN_FILTER }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } /* * Determine if async discard should be running. * * @discard_ctl: discard control * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, discard_ctl); return (!(fs_info->sb->s_flags & SB_RDONLY) && test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) block_group->discard_index = BTRFS_DISCARD_INDEX_START; block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } if (list_empty(&block_group->discard_list)) btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); } static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { if (!btrfs_is_block_group_data_only(block_group)) return; spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); } static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool queued; spin_lock(&discard_ctl->lock); queued = !list_empty(&block_group->discard_list); if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; } list_del_init(&block_group->discard_list); block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; if (!queued) btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); spin_unlock(&discard_ctl->lock); } static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = false; bool queued = false; spin_lock(&discard_ctl->lock); if (block_group == discard_ctl->block_group) { running = true; discard_ctl->block_group = NULL; } block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); /* * If the block group is currently running in the discard workfn, we * don't want to deref it, since it's still being used by the workfn. * The workfn will notice this case and deref the block group when it is * finished. */ if (queued && !running) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); return running; } /* * Find block_group that's up next for discarding. * * @discard_ctl: discard control * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. */ static struct btrfs_block_group *find_next_block_group( struct btrfs_discard_ctl *discard_ctl, u64 now) { struct btrfs_block_group *ret_block_group = NULL, *block_group; int i; for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { struct list_head *discard_list = &discard_ctl->discard_list[i]; if (!list_empty(discard_list)) { block_group = list_first_entry(discard_list, struct btrfs_block_group, discard_list); if (!ret_block_group) ret_block_group = block_group; if (ret_block_group->discard_eligible_time < now) break; if (ret_block_group->discard_eligible_time > block_group->discard_eligible_time) ret_block_group = block_group; } } return ret_block_group; } /* * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * * Wrap find_next_block_group() and set the block_group to be in use. * @discard_state's control flow is managed here. Variables related to * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ static struct btrfs_block_group *peek_discard_list( struct btrfs_discard_ctl *discard_ctl, enum btrfs_discard_state *discard_state, int *discard_index, u64 now) { struct btrfs_block_group *block_group; spin_lock(&discard_ctl->lock); again: block_group = find_next_block_group(discard_ctl, now); if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } discard_ctl->block_group = block_group; } if (block_group) { *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } spin_unlock(&discard_ctl->lock); return block_group; } /* * Update a block group's filters. * * @block_group: block group of interest * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches * a larger filter be returned to the free_space_cache, prioritize that discard * by moving @block_group to the proper filter. */ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_discard_ctl *discard_ctl; if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; discard_ctl = &block_group->fs_info->discard_ctl; if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && bytes >= discard_minlen[block_group->discard_index - 1]) { int i; remove_from_discard_list(discard_ctl, block_group); for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; i++) { if (bytes >= discard_minlen[i]) { block_group->discard_index = i; add_to_discard_list(discard_ctl, block_group); break; } } } } /* * Move a block group along the discard lists. * * @discard_ctl: discard control * @block_group: block_group of interest * * Increment @block_group's discard_index. If it falls of the list, let it be. * Otherwise add it back to the appropriate list. */ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { block_group->discard_index++; if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { block_group->discard_index = 1; return; } add_to_discard_list(discard_ctl, block_group); } /* * Remove a block_group from the discard lists. * * @discard_ctl: discard control * @block_group: block_group of interest * * Remove @block_group from the discard lists. If necessary, wait on the * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { if (remove_from_discard_list(discard_ctl, block_group)) { cancel_delayed_work_sync(&discard_ctl->work); btrfs_discard_schedule_work(discard_ctl, true); } } /* * Handles queuing the block_groups. * * @discard_ctl: discard control * @block_group: block_group of interest * * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; if (block_group->used == 0) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); if (!delayed_work_pending(&discard_ctl->work)) btrfs_discard_schedule_work(discard_ctl, false); } static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, u64 now, bool override) { struct btrfs_block_group *block_group; if (!btrfs_run_discard_work(discard_ctl)) return; if (!override && delayed_work_pending(&discard_ctl->work)) return; block_group = find_next_block_group(discard_ctl, now); if (block_group) { u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); /* * A single delayed workqueue item is responsible for * discarding, so we can manage the bytes rate limit by keeping * track of the previous discard. */ if (kbps_limit && discard_ctl->prev_discard) { u64 bps_limit = ((u64)kbps_limit) * SZ_1K; u64 bps_delay = div64_u64(discard_ctl->prev_discard * NSEC_PER_SEC, bps_limit); delay = max(delay, bps_delay); } /* * This timeout is to hopefully prevent immediate discarding * in a recently allocated block group. */ if (now < block_group->discard_eligible_time) { u64 bg_timeout = block_group->discard_eligible_time - now; delay = max(delay, bg_timeout); } if (override && discard_ctl->prev_discard) { u64 elapsed = now - discard_ctl->prev_discard_time; if (delay > elapsed) delay -= elapsed; else delay = 0; } mod_delayed_work(discard_ctl->discard_workers, &discard_ctl->work, nsecs_to_jiffies(delay)); } } /* * Responsible for scheduling the discard work. * * @discard_ctl: discard control * @override: override the current timer * * Discards are issued by a delayed workqueue item. @override is used to * update the current delay as the baseline delay interval is reevaluated on * transaction commit. This is also maxed with any other rate limit. */ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, bool override) { const u64 now = ktime_get_ns(); spin_lock(&discard_ctl->lock); __btrfs_discard_schedule_work(discard_ctl, now, override); spin_unlock(&discard_ctl->lock); } /* * Determine next step of a block_group. * * @discard_ctl: discard control * @block_group: block_group of interest * * Determine the next step for a block group after it's finished going through * a pass on a discard list. If it is unused and fully trimmed, we can mark it * unused and send it to the unused_bgs path. Otherwise, pass it onto the * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { remove_from_discard_list(discard_ctl, block_group); if (block_group->used == 0) { if (btrfs_is_free_space_trimmed(block_group)) btrfs_mark_bg_unused(block_group); else add_to_discard_unused_list(discard_ctl, block_group); } else { btrfs_update_discard_index(discard_ctl, block_group); } } /* * Discard work queue callback * * @work: work * * Find the next block_group to start discarding and then discard a single * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) { struct btrfs_discard_ctl *discard_ctl; struct btrfs_block_group *block_group; enum btrfs_discard_state discard_state; int discard_index = 0; u64 trimmed = 0; u64 minlen = 0; u64 now = ktime_get_ns(); discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); if (!block_group || !btrfs_run_discard_work(discard_ctl)) return; if (now < block_group->discard_eligible_time) { btrfs_discard_schedule_work(discard_ctl, false); return; } /* Perform discarding */ minlen = discard_minlen[discard_index]; if (discard_state == BTRFS_DISCARD_BITMAPS) { u64 maxlen = 0; /* * Use the previous levels minimum discard length as the max * length filter. In the case something is added to make a * region go beyond the max filter, the entire bitmap is set * back to BTRFS_TRIM_STATE_UNTRIMMED. */ if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) maxlen = discard_minlen[discard_index - 1]; btrfs_trim_block_group_bitmaps(block_group, &trimmed, block_group->discard_cursor, btrfs_block_group_end(block_group), minlen, maxlen, true); discard_ctl->discard_bitmap_bytes += trimmed; } else { btrfs_trim_block_group_extents(block_group, &trimmed, block_group->discard_cursor, btrfs_block_group_end(block_group), minlen, true); discard_ctl->discard_extent_bytes += trimmed; } /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { if (discard_state == BTRFS_DISCARD_BITMAPS) { btrfs_finish_discard_pass(discard_ctl, block_group); } else { block_group->discard_cursor = block_group->start; spin_lock(&discard_ctl->lock); if (block_group->discard_state != BTRFS_DISCARD_RESET_CURSOR) block_group->discard_state = BTRFS_DISCARD_BITMAPS; spin_unlock(&discard_ctl->lock); } } now = ktime_get_ns(); spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; /* * If the block group was removed from the discard list while it was * running in this workfn, then we didn't deref it, since this function * still owned that reference. But we set the discard_ctl->block_group * back to NULL, so we can use that condition to know that now we need * to deref the block_group. */ if (discard_ctl->block_group == NULL) btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); } /* * Recalculate the base delay. * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) { s32 discardable_extents; s64 discardable_bytes; u32 iops_limit; unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; unsigned long delay; discardable_extents = atomic_read(&discard_ctl->discardable_extents); if (!discardable_extents) return; spin_lock(&discard_ctl->lock); /* * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called by from * btrfs_finish_extent_commit() which is synchronized, we can correct * here. */ if (discardable_extents < 0) atomic_add(-discardable_extents, &discard_ctl->discardable_extents); discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); if (discardable_bytes < 0) atomic64_add(-discardable_bytes, &discard_ctl->discardable_bytes); if (discardable_extents <= 0) { spin_unlock(&discard_ctl->lock); return; } iops_limit = READ_ONCE(discard_ctl->iops_limit); if (iops_limit) { delay = MSEC_PER_SEC / iops_limit; } else { /* * Unset iops_limit means go as fast as possible, so allow a * delay of 0. */ delay = 0; min_delay = 0; } delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); } /* * Propagate discard counters. * * @block_group: block_group of interest * * Propagate deltas of counters up to the discard_ctl. It maintains a current * counter and a previous counter passing the delta up to the global stat. * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl; struct btrfs_discard_ctl *discard_ctl; s32 extents_delta; s64 bytes_delta; if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || !btrfs_is_block_group_data_only(block_group)) return; ctl = block_group->free_space_ctl; discard_ctl = &block_group->fs_info->discard_ctl; lockdep_assert_held(&ctl->tree_lock); extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - ctl->discardable_extents[BTRFS_STAT_PREV]; if (extents_delta) { atomic_add(extents_delta, &discard_ctl->discardable_extents); ctl->discardable_extents[BTRFS_STAT_PREV] = ctl->discardable_extents[BTRFS_STAT_CURR]; } bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - ctl->discardable_bytes[BTRFS_STAT_PREV]; if (bytes_delta) { atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); ctl->discardable_bytes[BTRFS_STAT_PREV] = ctl->discardable_bytes[BTRFS_STAT_CURR]; } } /* * Punt unused_bgs list to discard lists. * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the * order of operations is changed. In the normal synchronous discard path, the * block groups are trimmed via a single large trim in transaction commit. This * is ultimately what we are trying to avoid with asynchronous discard. Thus, * it must be done before going down the unused_bgs path. */ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) { struct btrfs_block_group *block_group, *next; spin_lock(&fs_info->unused_bgs_lock); /* We enabled async discard, so punt all to the queue */ list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); /* * This put is for the get done by btrfs_mark_bg_unused. * Queueing discard incremented it for discard's reference. */ btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } /* * Purge discard lists. * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that * are completely free and ready for the unused_bgs path. As discarding will * now happen in transaction commit or not at all, we can safely mark the * corresponding block groups as unused and they will be sent on their merry * way to the unused_bgs list. */ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) { struct btrfs_block_group *block_group, *next; int i; spin_lock(&discard_ctl->lock); for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { list_for_each_entry_safe(block_group, next, &discard_ctl->discard_list[i], discard_list) { list_del_init(&block_group->discard_list); spin_unlock(&discard_ctl->lock); if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); } void btrfs_discard_resume(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_cleanup(fs_info); return; } btrfs_discard_punt_unused_bgs_list(fs_info); set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); } void btrfs_discard_stop(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); } void btrfs_discard_init(struct btrfs_fs_info *fs_info) { struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; int i; spin_lock_init(&discard_ctl->lock); INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) INIT_LIST_HEAD(&discard_ctl->discard_list[i]); discard_ctl->prev_discard = 0; discard_ctl->prev_discard_time = 0; atomic_set(&discard_ctl->discardable_extents, 0); atomic64_set(&discard_ctl->discardable_bytes, 0); discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC; discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; discard_ctl->kbps_limit = 0; discard_ctl->discard_extent_bytes = 0; discard_ctl->discard_bitmap_bytes = 0; atomic64_set(&discard_ctl->discard_bytes_saved, 0); } void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) { btrfs_discard_stop(fs_info); cancel_delayed_work_sync(&fs_info->discard_ctl.work); btrfs_discard_purge_list(&fs_info->discard_ctl); } |
38 38 38 38 38 38 13 38 37 38 16 38 36 36 36 36 16 16 16 38 38 37 38 38 38 16 15 16 16 16 38 38 37 2 38 36 16 16 16 16 15 13 12 2 2 38 38 38 38 38 38 2 38 38 38 38 38 2 2 18 18 16 38 9 9 11 38 8 32 33 33 2 2 33 33 33 33 34 33 9 31 16 16 16 15 16 16 16 16 16 9 9 16 16 16 16 16 14 14 9 5 14 14 14 14 38 38 38 29 30 13 13 10 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" #include "xfs_btree_staging.h" #include "xfs_ag.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_bmap_btree.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_health.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" /* * Btree magic numbers. */ uint32_t xfs_btree_magic( struct xfs_mount *mp, const struct xfs_btree_ops *ops) { int idx = xfs_has_crc(mp) ? 1 : 0; __be32 magic = ops->buf_ops->magic[idx]; /* Ensure we asked for crc for crc-only magics. */ ASSERT(magic != 0); return be32_to_cpu(magic); } /* * These sibling pointer checks are optimised for null sibling pointers. This * happens a lot, and we don't need to byte swap at runtime if the sibling * pointer is NULL. * * These are explicitly marked at inline because the cost of calling them as * functions instead of inlining them is about 36 bytes extra code per call site * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these * two sibling check functions reduces the compiled code size by over 300 * bytes. */ static inline xfs_failaddr_t xfs_btree_check_fsblock_siblings( struct xfs_mount *mp, xfs_fsblock_t fsb, __be64 dsibling) { xfs_fsblock_t sibling; if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; if (!xfs_verify_fsbno(mp, sibling)) return __this_address; return NULL; } static inline xfs_failaddr_t xfs_btree_check_memblock_siblings( struct xfs_buftarg *btp, xfbno_t bno, __be64 dsibling) { xfbno_t sibling; if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; sibling = be64_to_cpu(dsibling); if (sibling == bno) return __this_address; if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) return __this_address; return NULL; } static inline xfs_failaddr_t xfs_btree_check_agblock_siblings( struct xfs_perag *pag, xfs_agblock_t agbno, __be32 dsibling) { xfs_agblock_t sibling; if (dsibling == cpu_to_be32(NULLAGBLOCK)) return NULL; sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; if (!xfs_verify_agbno(pag, sibling)) return __this_address; return NULL; } static xfs_failaddr_t __xfs_btree_check_lblock_hdr( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; if (block->bb_u.l.bb_pad != cpu_to_be32(0)) return __this_address; } if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; return NULL; } /* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ static xfs_failaddr_t __xfs_btree_check_fsblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; xfs_failaddr_t fa; xfs_fsblock_t fsb; fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); if (fa) return fa; /* * For inode-rooted btrees, the root block sits in the inode fork. In * that case bp is NULL, and the block must not have any siblings. */ if (!bp) { if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) return __this_address; if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) return __this_address; return NULL; } fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } /* * Check an in-memory btree block header. Return the address of the failing * check, or NULL if everything is ok. */ static xfs_failaddr_t __xfs_btree_check_memblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; xfs_failaddr_t fa; xfbno_t bno; fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); if (fa) return fa; bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); fa = xfs_btree_check_memblock_siblings(btp, bno, block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_memblock_siblings(btp, bno, block->bb_u.l.bb_rightsib); return fa; } /* * Check a short btree block header. Return the address of the failing check, * or NULL if everything is ok. */ static xfs_failaddr_t __xfs_btree_check_agblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = to_perag(cur->bc_group); xfs_failaddr_t fa; xfs_agblock_t agbno; if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; } if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_rightsib); return fa; } /* * Internal btree block check. * * Return NULL if the block is ok or the address of the failed check otherwise. */ xfs_failaddr_t __xfs_btree_check_block( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: return __xfs_btree_check_memblock(cur, block, level, bp); case XFS_BTREE_TYPE_AG: return __xfs_btree_check_agblock(cur, block, level, bp); case XFS_BTREE_TYPE_INODE: return __xfs_btree_check_fsblock(cur, block, level, bp); default: ASSERT(0); return __this_address; } } static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) { if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) return XFS_ERRTAG_BTREE_CHECK_SBLOCK; return XFS_ERRTAG_BTREE_CHECK_LBLOCK; } /* * Debug routine: check that block header is ok. */ int xfs_btree_check_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { struct xfs_mount *mp = cur->bc_mp; xfs_failaddr_t fa; fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } int __xfs_btree_check_ptr( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level) { if (level <= 0) return -EFSCORRUPTED; switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_INODE: if (!xfs_verify_fsbno(cur->bc_mp, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_AG: if (!xfs_verify_agbno(to_perag(cur->bc_group), be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; break; } return 0; } /* * Check that a given (indexed) btree pointer at a certain level of a * btree is valid and doesn't point past where it should. */ static int xfs_btree_check_ptr( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level) { int error; error = __xfs_btree_check_ptr(cur, ptr, index, level); if (error) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: xfs_err(cur->bc_mp, "In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", cur->bc_ops->name, cur->bc_flags, level, index, __this_address); break; case XFS_BTREE_TYPE_INODE: xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork, cur->bc_ops->name, level, index); break; case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", cur->bc_group->xg_gno, cur->bc_ops->name, level, index); break; } xfs_btree_mark_sick(cur); } return error; } #ifdef DEBUG # define xfs_btree_debug_check_ptr xfs_btree_check_ptr #else # define xfs_btree_debug_check_ptr(...) (0) #endif /* * Calculate CRC on the whole btree block and stuff it into the * long-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void xfs_btree_fsblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); } bool xfs_btree_fsblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); } return true; } /* * Calculate CRC on the whole btree block and stuff it into the * short-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void xfs_btree_agblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } bool xfs_btree_agblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } return true; } static int xfs_btree_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { int error; trace_xfs_btree_free_block(cur, bp); /* * Don't allow block freeing for a staging cursor, because staging * cursors do not support regular btree modifications. */ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { ASSERT(0); return -EFSCORRUPTED; } error = cur->bc_ops->free_block(cur, bp); if (!error) { xfs_trans_binval(cur->bc_tp, bp); XFS_BTREE_STATS_INC(cur, free); } return error; } /* * Delete the btree cursor. */ void xfs_btree_del_cursor( struct xfs_btree_cur *cur, /* btree cursor */ int error) /* del because of error */ { int i; /* btree level */ /* * Clear the buffer pointers and release the buffers. If we're doing * this because of an error, inspect all of the entries in the bc_bufs * array for buffers to be unlocked. This is because some of the btree * code works from level n down to 0, and if we get an error along the * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_levels[i].bp) xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); else if (!error) break; } /* * If we are doing a BMBT update, the number of unaccounted blocks * allocated during this cursor life time should be zero. If it's not * zero, then we should be shut down or on our way to shutdown due to * cancelling a dirty transaction on error. */ ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); if (cur->bc_group) xfs_group_put(cur->bc_group); kmem_cache_free(cur->bc_cache, cur); } /* Return the buffer target for this btree's buffer. */ static inline struct xfs_buftarg * xfs_btree_buftarg( struct xfs_btree_cur *cur) { if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) return cur->bc_mem.xfbtree->target; return cur->bc_mp->m_ddev_targp; } /* Return the block size (in units of 512b sectors) for this btree. */ static inline unsigned int xfs_btree_bbsize( struct xfs_btree_cur *cur) { if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) return XFBNO_BBSIZE; return cur->bc_mp->m_bsize; } /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ int /* error */ xfs_btree_dup_cursor( struct xfs_btree_cur *cur, /* input cursor */ struct xfs_btree_cur **ncur) /* output cursor */ { struct xfs_mount *mp = cur->bc_mp; struct xfs_trans *tp = cur->bc_tp; struct xfs_buf *bp; struct xfs_btree_cur *new; int error; int i; /* * Don't allow staging cursors to be duplicated because they're supposed * to be kept private to a single thread. */ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { ASSERT(0); return -EFSCORRUPTED; } /* * Allocate a new cursor like the old one. */ new = cur->bc_ops->dup_cursor(cur); /* * Copy the record currently in the cursor. */ new->bc_rec = cur->bc_rec; /* * For each level current, re-get the buffer and copy the ptr value. */ for (i = 0; i < new->bc_nlevels; i++) { new->bc_levels[i].ptr = cur->bc_levels[i].ptr; new->bc_levels[i].ra = cur->bc_levels[i].ra; bp = cur->bc_levels[i].bp; if (bp) { error = xfs_trans_read_buf(mp, tp, xfs_btree_buftarg(cur), xfs_buf_daddr(bp), xfs_btree_bbsize(cur), 0, &bp, cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(new); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; } } new->bc_levels[i].bp = bp; } *ncur = new; return 0; } /* * XFS btree block layout and addressing: * * There are two types of blocks in the btree: leaf and non-leaf blocks. * * The leaf record start with a header then followed by records containing * the values. A non-leaf block also starts with the same header, and * then first contains lookup keys followed by an equal number of pointers * to the btree blocks at the previous level. * * +--------+-------+-------+-------+-------+-------+-------+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | * +--------+-------+-------+-------+-------+-------+-------+ * * +--------+-------+-------+-------+-------+-------+-------+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | * +--------+-------+-------+-------+-------+-------+-------+ * * The header is called struct xfs_btree_block for reasons better left unknown * and comes in different versions for short (32bit) and long (64bit) block * pointers. The record and key structures are defined by the btree instances * and opaque to the btree core. The block pointers are simple disk endian * integers, available in a short (32bit) and long (64bit) variant. * * The helpers below calculate the offset of a given record, key or pointer * into a btree block (xfs_btree_*_offset) or return a pointer to the given * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! * * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing * overlapping intervals. In such a tree, records are still sorted lowest to * highest and indexed by the smallest key value that refers to the record. * However, nodes are different: each pointer has two associated keys -- one * indexing the lowest key available in the block(s) below (the same behavior * as the key in a regular btree) and another indexing the highest key * available in the block(s) below. Because records are /not/ sorted by the * highest key, all leaf block updates require us to compute the highest key * that matches any record in the leaf and to recursively update the high keys * in the nodes going further up in the tree, if necessary. Nodes look like * this: * * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ * * To perform an interval query on an overlapped tree, perform the usual * depth-first search and use the low and high keys to decide if we can skip * that particular node. If a leaf node is reached, return the records that * intersect the interval. Note that an interval query may return numerous * entries. For a non-overlapped tree, simply search for the record associated * with the lowest key and iterate forward until a non-matching record is * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in * more detail. * * Why do we care about overlapping intervals? Let's say you have a bunch of * reverse mapping records on a reflink filesystem: * * 1: +- file A startblock B offset C length D -----------+ * 2: +- file E startblock F offset G length H --------------+ * 3: +- file I startblock F offset J length K --+ * 4: +- file L... --+ * * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, * we'd simply increment the length of record 1. But how do we find the record * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return * record 3 because the keys are ordered first by startblock. An interval * query would return records 1 and 2 because they both overlap (B+D-1), and * from that we can pick out record 1 as the appropriate left neighbor. * * In the non-overlapped case you can do a LE lookup and decrement the cursor * because a record's interval must end before the next record. */ /* * Return size of the btree block header for this btree instance. */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; } if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_SBLOCK_CRC_LEN; return XFS_BTREE_SBLOCK_LEN; } /* * Calculate offset of the n-th record in a btree block. */ STATIC size_t xfs_btree_rec_offset( struct xfs_btree_cur *cur, int n) { return xfs_btree_block_len(cur) + (n - 1) * cur->bc_ops->rec_len; } /* * Calculate offset of the n-th key in a btree block. */ STATIC size_t xfs_btree_key_offset( struct xfs_btree_cur *cur, int n) { return xfs_btree_block_len(cur) + (n - 1) * cur->bc_ops->key_len; } /* * Calculate offset of the n-th high key in a btree block. */ STATIC size_t xfs_btree_high_key_offset( struct xfs_btree_cur *cur, int n) { return xfs_btree_block_len(cur) + (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); } /* * Calculate offset of the n-th block pointer in a btree block. */ STATIC size_t xfs_btree_ptr_offset( struct xfs_btree_cur *cur, int n, int level) { return xfs_btree_block_len(cur) + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + (n - 1) * cur->bc_ops->ptr_len; } /* * Return a pointer to the n-th record in the btree block. */ union xfs_btree_rec * xfs_btree_rec_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { return (union xfs_btree_rec *) ((char *)block + xfs_btree_rec_offset(cur, n)); } /* * Return a pointer to the n-th key in the btree block. */ union xfs_btree_key * xfs_btree_key_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { return (union xfs_btree_key *) ((char *)block + xfs_btree_key_offset(cur, n)); } /* * Return a pointer to the n-th high key in the btree block. */ union xfs_btree_key * xfs_btree_high_key_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { return (union xfs_btree_key *) ((char *)block + xfs_btree_high_key_offset(cur, n)); } /* * Return a pointer to the n-th block pointer in the btree block. */ union xfs_btree_ptr * xfs_btree_ptr_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { int level = xfs_btree_get_level(block); ASSERT(block->bb_level != 0); return (union xfs_btree_ptr *) ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } struct xfs_ifork * xfs_btree_ifork_ptr( struct xfs_btree_cur *cur) { ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); } /* * Get the root block which is stored in the inode. * * For now this btree implementation assumes the btree root is always * stored in the if_broot field of an inode fork. */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return (struct xfs_btree_block *)ifp->if_broot; } /* * Retrieve the block pointer from the cursor at the given level. * This may be an inode btree root or from a buffer. */ struct xfs_btree_block * /* generic btree block pointer */ xfs_btree_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { if (xfs_btree_at_iroot(cur, level)) { *bpp = NULL; return xfs_btree_get_iroot(cur); } *bpp = cur->bc_levels[level].bp; return XFS_BUF_TO_BLOCK(*bpp); } /* * Change the cursor to point to the first record at the given level. * Other levels are unaffected. */ STATIC int /* success=1, failure=0 */ xfs_btree_firstrec( struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ struct xfs_buf *bp; /* buffer containing block */ /* * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); if (xfs_btree_check_block(cur, block, level, bp)) return 0; /* * It's empty, there is no such record. */ if (!block->bb_numrecs) return 0; /* * Set the ptr value to 1, that's the first record/key. */ cur->bc_levels[level].ptr = 1; return 1; } /* * Change the cursor to point to the last record in the current block * at the given level. Other levels are unaffected. */ STATIC int /* success=1, failure=0 */ xfs_btree_lastrec( struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ struct xfs_buf *bp; /* buffer containing block */ /* * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); if (xfs_btree_check_block(cur, block, level, bp)) return 0; /* * It's empty, there is no such record. */ if (!block->bb_numrecs) return 0; /* * Set the ptr value to numrecs, that's the last record/key. */ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); return 1; } /* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ void xfs_btree_offsets( uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; } } /* * Find the highest bit, so the last byte offset. */ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; } } } STATIC int xfs_btree_readahead_fsblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } return rval; } STATIC int xfs_btree_readahead_memblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, cur->bc_ops->buf_ops); rval++; } return rval; } STATIC int xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, xfs_agbno_to_daddr(pag, left), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, xfs_agbno_to_daddr(pag, right), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } return rval; } /* * Read-ahead btree blocks, at the given level. * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. */ STATIC int xfs_btree_readahead( struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ int lr) /* left/right bits */ { struct xfs_btree_block *block; /* * No readahead needed if we are at the root level and the * btree root is stored in the inode. */ if (xfs_btree_at_iroot(cur, lev)) return 0; if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) return 0; cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: return xfs_btree_readahead_agblock(cur, lr, block); case XFS_BTREE_TYPE_INODE: return xfs_btree_readahead_fsblock(cur, lr, block); case XFS_BTREE_TYPE_MEM: return xfs_btree_readahead_memblock(cur, lr, block); default: ASSERT(0); return 0; } } STATIC int xfs_btree_ptr_to_daddr( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, xfs_daddr_t *daddr) { int error; error = xfs_btree_check_ptr(cur, ptr, 0, 1); if (error) return error; switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), be32_to_cpu(ptr->s)); break; case XFS_BTREE_TYPE_INODE: *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); break; case XFS_BTREE_TYPE_MEM: *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); break; } return 0; } /* * Readahead @count btree blocks at the given @ptr location. * * We don't need to care about long or short form btrees here as we have a * method of converting the ptr directly to a daddr available to us. */ STATIC void xfs_btree_readahead_ptr( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, xfs_extlen_t count) { xfs_daddr_t daddr; if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) return; xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, xfs_btree_bbsize(cur) * count, cur->bc_ops->buf_ops); } /* * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. */ STATIC void xfs_btree_setbuf( struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ struct xfs_buf *bp) /* new buffer to set */ { struct xfs_btree_block *b; /* btree block */ if (cur->bc_levels[lev].bp) xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); cur->bc_levels[lev].bp = bp; cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } else { if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } } bool xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); } void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); } static inline bool xfs_btree_ptrs_equal( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr1, union xfs_btree_ptr *ptr2) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr1->l == ptr2->l; return ptr1->s == ptr2->s; } /* * Get/set/init sibling pointers */ void xfs_btree_get_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) ptr->l = block->bb_u.l.bb_rightsib; else ptr->l = block->bb_u.l.bb_leftsib; } else { if (lr == XFS_BB_RIGHTSIB) ptr->s = block->bb_u.s.bb_rightsib; else ptr->s = block->bb_u.s.bb_leftsib; } } void xfs_btree_set_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) block->bb_u.l.bb_rightsib = ptr->l; else block->bb_u.l.bb_leftsib = ptr->l; } else { if (lr == XFS_BB_RIGHTSIB) block->bb_u.s.bb_rightsib = ptr->s; else block->bb_u.s.bb_leftsib = ptr->s; } } static void __xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, xfs_daddr_t blkno, __u16 level, __u16 numrecs, __u64 owner) { bool crc = xfs_has_crc(mp); __u32 magic = xfs_btree_magic(mp, ops); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (crc) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.l.bb_pad = 0; buf->bb_u.l.bb_lsn = 0; } } else { buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); /* owner is a 32 bit value on short blocks */ buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } } } void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *block, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner) { __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, numrecs, owner); } void xfs_btree_init_buf( struct xfs_mount *mp, struct xfs_buf *bp, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner) { __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, xfs_buf_daddr(bp), level, numrecs, owner); bp->b_ops = ops->buf_ops; } static inline __u64 xfs_btree_owner( struct xfs_btree_cur *cur) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: return cur->bc_mem.xfbtree->owner; case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; case XFS_BTREE_TYPE_AG: return cur->bc_group->xg_gno; default: ASSERT(0); return 0; } } void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs) { xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, xfs_btree_owner(cur)); } STATIC void xfs_btree_buf_to_ptr( struct xfs_btree_cur *cur, struct xfs_buf *bp, union xfs_btree_ptr *ptr) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp))); break; case XFS_BTREE_TYPE_INODE: ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp))); break; case XFS_BTREE_TYPE_MEM: ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); break; } } static inline void xfs_btree_set_refs( struct xfs_btree_cur *cur, struct xfs_buf *bp) { xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); } int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp) { xfs_daddr_t d; int error; error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, xfs_btree_bbsize(cur), 0, bpp); if (error) return error; (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } /* * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; int error; /* need to sort out how callers deal with failures first */ ASSERT(!(flags & XBF_TRYLOCK)); error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, xfs_btree_bbsize(cur), flags, bpp, cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) return error; xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } /* * Copy keys from one btree block to another. */ void xfs_btree_copy_keys( struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys) { ASSERT(numkeys >= 0); memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); } /* * Copy records from one btree block to another. */ STATIC void xfs_btree_copy_recs( struct xfs_btree_cur *cur, union xfs_btree_rec *dst_rec, union xfs_btree_rec *src_rec, int numrecs) { ASSERT(numrecs >= 0); memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); } /* * Copy block pointers from one btree block to another. */ void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); } /* * Shift keys one index left/right inside a single btree block. */ STATIC void xfs_btree_shift_keys( struct xfs_btree_cur *cur, union xfs_btree_key *key, int dir, int numkeys) { char *dst_key; ASSERT(numkeys >= 0); ASSERT(dir == 1 || dir == -1); dst_key = (char *)key + (dir * cur->bc_ops->key_len); memmove(dst_key, key, numkeys * cur->bc_ops->key_len); } /* * Shift records one index left/right inside a single btree block. */ STATIC void xfs_btree_shift_recs( struct xfs_btree_cur *cur, union xfs_btree_rec *rec, int dir, int numrecs) { char *dst_rec; ASSERT(numrecs >= 0); ASSERT(dir == 1 || dir == -1); dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); } /* * Shift block pointers one index left/right inside a single btree block. */ STATIC void xfs_btree_shift_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int dir, int numptrs) { char *dst_ptr; ASSERT(numptrs >= 0); ASSERT(dir == 1 || dir == -1); dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); } /* * Log key values from the btree block. */ STATIC void xfs_btree_log_keys( struct xfs_btree_cur *cur, struct xfs_buf *bp, int first, int last) { if (bp) { xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } /* * Log record values from the btree block. */ void xfs_btree_log_recs( struct xfs_btree_cur *cur, struct xfs_buf *bp, int first, int last) { xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); } /* * Log block pointer fields from a btree block (nonleaf). */ STATIC void xfs_btree_log_ptrs( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ int first, /* index of first pointer to log */ int last) /* index of last pointer to log */ { if (bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } /* * Log fields from a btree block header. */ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ uint32_t fields) /* mask of fields: XFS_BB_... */ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ static const short soffsets[] = { /* table of offsets (short) */ offsetof(struct xfs_btree_block, bb_magic), offsetof(struct xfs_btree_block, bb_level), offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), offsetof(struct xfs_btree_block, bb_u.s.bb_owner), offsetof(struct xfs_btree_block, bb_u.s.bb_crc), XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), offsetof(struct xfs_btree_block, bb_level), offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), offsetof(struct xfs_btree_block, bb_u.l.bb_owner), offsetof(struct xfs_btree_block, bb_u.l.bb_crc), offsetof(struct xfs_btree_block, bb_u.l.bb_pad), XFS_BTREE_LBLOCK_CRC_LEN }; if (bp) { int nbits; if (xfs_has_crc(cur->bc_mp)) { /* * We don't log the CRC when updating a btree * block but instead recreate it during log * recovery. As the log buffers have checksums * of their own this is safe and avoids logging a crc * update in a lot of places. */ if (fields == XFS_BB_ALL_BITS) fields = XFS_BB_ALL_BITS_CRC; nbits = XFS_BB_NUM_BITS_CRC; } else { nbits = XFS_BB_NUM_BITS; } xfs_btree_offsets(fields, (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? loffsets : soffsets, nbits, &first, &last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } /* * Increment cursor by one record at the level. * For nonzero levels the leaf-ward information is untouched. */ int /* error */ xfs_btree_increment( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_btree_block *block; union xfs_btree_ptr ptr; struct xfs_buf *bp; int error; /* error return value */ int lev; ASSERT(level < cur->bc_nlevels); /* Read-ahead to the right at this level. */ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); /* Get a pointer to the btree block. */ block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* We're done if we remain in the block after the increment. */ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) goto out1; /* Fail if we just went off the right edge of the tree. */ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); if (xfs_btree_ptr_is_null(cur, &ptr)) goto out0; XFS_BTREE_STATS_INC(cur, increment); /* * March up the tree incrementing pointers. * Stop when we don't go off the right edge of a block. */ for (lev = level + 1; lev < cur->bc_nlevels; lev++) { block = xfs_btree_get_block(cur, lev, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, lev, bp); if (error) goto error0; #endif if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) break; /* Read-ahead the right block for the next loop. */ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); } /* * If we went off the root then we are either seriously * confused or have the tree root in an inode. */ if (lev == cur->bc_nlevels) { if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } ASSERT(lev < cur->bc_nlevels); /* * Now walk back down the tree, fixing up the cursor's buffer * pointers and key numbers. */ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); cur->bc_levels[lev].ptr = 1; } out1: *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; } /* * Decrement cursor by one record at the level. * For nonzero levels the leaf-ward information is untouched. */ int /* error */ xfs_btree_decrement( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_btree_block *block; struct xfs_buf *bp; int error; /* error return value */ int lev; union xfs_btree_ptr ptr; ASSERT(level < cur->bc_nlevels); /* Read-ahead to the left at this level. */ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); /* We're done if we remain in the block after the decrement. */ if (--cur->bc_levels[level].ptr > 0) goto out1; /* Get a pointer to the btree block. */ block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* Fail if we just went off the left edge of the tree. */ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); if (xfs_btree_ptr_is_null(cur, &ptr)) goto out0; XFS_BTREE_STATS_INC(cur, decrement); /* * March up the tree decrementing pointers. * Stop when we don't go off the left edge of a block. */ for (lev = level + 1; lev < cur->bc_nlevels; lev++) { if (--cur->bc_levels[lev].ptr > 0) break; /* Read-ahead the left block for the next loop. */ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); } /* * If we went off the root then we are seriously confused. * or the root of the tree is in an inode. */ if (lev == cur->bc_nlevels) { if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } ASSERT(lev < cur->bc_nlevels); /* * Now walk back down the tree, fixing up the cursor's buffer * pointers and key numbers. */ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); } out1: *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; } /* * Check the btree block owner now that we have the context to know who the * real owner is. */ static inline xfs_failaddr_t xfs_btree_check_block_owner( struct xfs_btree_cur *cur, struct xfs_btree_block *block) { __u64 owner; if (!xfs_has_crc(cur->bc_mp) || (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) return NULL; owner = xfs_btree_owner(cur); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) return __this_address; } else { if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) return __this_address; } return NULL; } int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in the btree */ const union xfs_btree_ptr *pp, /* ptr to btree block */ struct xfs_btree_block **blkp) /* return btree block */ { struct xfs_buf *bp; /* buffer pointer for btree block */ xfs_daddr_t daddr; int error = 0; /* special case the root block if in an inode */ if (xfs_btree_at_iroot(cur, level)) { *blkp = xfs_btree_get_iroot(cur); return 0; } /* * If the old buffer at this level for the disk address we are * looking for re-use it. * * Otherwise throw it away and get a new one. */ bp = cur->bc_levels[level].bp; error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; if (bp && xfs_buf_daddr(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); if (error) return error; /* Check the inode owner since the verifiers don't. */ if (xfs_btree_check_block_owner(cur, *blkp) != NULL) goto out_bad; /* Did we get the level we were looking for? */ if (be16_to_cpu((*blkp)->bb_level) != level) goto out_bad; /* Check that internal nodes have at least one record. */ if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) goto out_bad; xfs_btree_setbuf(cur, level, bp); return 0; out_bad: *blkp = NULL; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } /* * Get current search key. For level 0 we don't actually have a key * structure so we make one up from the record. For all other levels * we just return the right key. */ STATIC union xfs_btree_key * xfs_lookup_get_search_key( struct xfs_btree_cur *cur, int level, int keyno, struct xfs_btree_block *block, union xfs_btree_key *kp) { if (level == 0) { cur->bc_ops->init_key_from_rec(kp, xfs_btree_rec_addr(cur, keyno, block)); return kp; } return xfs_btree_key_addr(cur, keyno, block); } /* * Initialize a pointer to the root block. */ void xfs_btree_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * Inode-rooted btrees call xfs_btree_get_iroot to find the root * in xfs_btree_lookup_get_block and don't need a pointer here. */ ptr->l = 0; } else if (cur->bc_flags & XFS_BTREE_STAGING) { ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); } else { cur->bc_ops->init_ptr_from_cur(cur, ptr); } } /* * Lookup the record. The cursor is made to point to it, based on dir. * stat is set to 0 if can't find any such record, 1 for success. */ int /* error */ xfs_btree_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_lookup_t dir, /* <=, ==, or >= */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* current btree block */ int64_t diff; /* difference for the current key */ int error; /* error return value */ int keyno; /* current key number */ int level; /* level in the btree */ union xfs_btree_ptr *pp; /* ptr to btree block */ union xfs_btree_ptr ptr; /* ptr to btree block */ XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } block = NULL; keyno = 0; /* initialise start pointer from cursor */ xfs_btree_init_ptr_from_cur(cur, &ptr); pp = &ptr; /* * Iterate over each level in the btree, starting at the root. * For each level above the leaves, find the key we need, based * on the lookup record, then follow the corresponding block * pointer down to the next level. */ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { /* Get the block we need to do the lookup on. */ error = xfs_btree_lookup_get_block(cur, level, pp, &block); if (error) goto error0; if (diff == 0) { /* * If we already had a key match at a higher level, we * know we need to use the first entry in this block. */ keyno = 1; } else { /* Otherwise search this block. Do a binary search. */ int high; /* high entry number */ int low; /* low entry number */ /* Set low and high entry numbers, 1-based. */ low = 1; high = xfs_btree_get_numrecs(block); if (!high) { /* Block is empty, must be an empty leaf. */ if (level != 0 || cur->bc_nlevels != 1) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, cur->bc_mp, block, sizeof(*block)); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; *stat = 0; return 0; } /* Binary search the block. */ while (low <= high) { union xfs_btree_key key; union xfs_btree_key *kp; XFS_BTREE_STATS_INC(cur, compare); /* keyno is average of low and high. */ keyno = (low + high) >> 1; /* Get current search key */ kp = xfs_lookup_get_search_key(cur, level, keyno, block, &key); /* * Compute difference to get next direction: * - less than, move right * - greater than, move left * - equal, we're done */ diff = cur->bc_ops->key_diff(cur, kp); if (diff < 0) low = keyno + 1; else if (diff > 0) high = keyno - 1; else break; } } /* * If there are more levels, set up for the next level * by getting the block number and filling in the cursor. */ if (level > 0) { /* * If we moved left, need the previous key number, * unless there isn't one. */ if (diff > 0 && --keyno < 1) keyno = 1; pp = xfs_btree_ptr_addr(cur, keyno, block); error = xfs_btree_debug_check_ptr(cur, pp, 0, level); if (error) goto error0; cur->bc_levels[level].ptr = keyno; } } /* Done with the search. See if we need to adjust the results. */ if (dir != XFS_LOOKUP_LE && diff < 0) { keyno++; /* * If ge search and we went off the end of the block, but it's * not the last block, we're in the wrong block. */ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); if (dir == XFS_LOOKUP_GE && keyno > xfs_btree_get_numrecs(block) && !xfs_btree_ptr_is_null(cur, &ptr)) { int i; cur->bc_levels[0].ptr = keyno; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } *stat = 1; return 0; } } else if (dir == XFS_LOOKUP_LE && diff > 0) keyno--; cur->bc_levels[0].ptr = keyno; /* Return if we succeeded or not. */ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) *stat = 0; else if (dir != XFS_LOOKUP_EQ || diff == 0) *stat = 1; else *stat = 0; return 0; error0: return error; } /* Find the high key storage area from a regular key. */ union xfs_btree_key * xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) { ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); return (union xfs_btree_key *)((char *)key + (cur->bc_ops->key_len / 2)); } /* Determine the low (and high if overlapped) keys of a leaf block */ STATIC void xfs_btree_get_leaf_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key) { union xfs_btree_key max_hkey; union xfs_btree_key hkey; union xfs_btree_rec *rec; union xfs_btree_key *high; int n; rec = xfs_btree_rec_addr(cur, 1, block); cur->bc_ops->init_key_from_rec(key, rec); if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { rec = xfs_btree_rec_addr(cur, n, block); cur->bc_ops->init_high_key_from_rec(&hkey, rec); if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) max_hkey = hkey; } high = xfs_btree_high_key_from_key(cur, key); memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); } } /* Determine the low (and high if overlapped) keys of a node block */ STATIC void xfs_btree_get_node_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key) { union xfs_btree_key *hkey; union xfs_btree_key *max_hkey; union xfs_btree_key *high; int n; if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len / 2); max_hkey = xfs_btree_high_key_addr(cur, 1, block); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { hkey = xfs_btree_high_key_addr(cur, n, block); if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) max_hkey = hkey; } high = xfs_btree_high_key_from_key(cur, key); memcpy(high, max_hkey, cur->bc_ops->key_len / 2); } else { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len); } } /* Derive the keys for any btree block. */ void xfs_btree_get_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key) { if (be16_to_cpu(block->bb_level) == 0) xfs_btree_get_leaf_keys(cur, block, key); else xfs_btree_get_node_keys(cur, block, key); } /* * Decide if we need to update the parent keys of a btree block. For * a standard btree this is only necessary if we're updating the first * record/key. For an overlapping btree, we must always update the * keys because the highest key can be in any of the records or keys * in the block. */ static inline bool xfs_btree_needs_key_update( struct xfs_btree_cur *cur, int ptr) { return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; } /* * Update the low and high parent keys of the given level, progressing * towards the root. If force_all is false, stop if the keys for a given * level do not need updating. */ STATIC int __xfs_btree_updkeys( struct xfs_btree_cur *cur, int level, struct xfs_btree_block *block, struct xfs_buf *bp0, bool force_all) { union xfs_btree_key key; /* keys from current level */ union xfs_btree_key *lkey; /* keys from the next level up */ union xfs_btree_key *hkey; union xfs_btree_key *nlkey; /* keys from the next level up */ union xfs_btree_key *nhkey; struct xfs_buf *bp; int ptr; ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); /* Exit if there aren't any parent levels to update. */ if (level + 1 >= cur->bc_nlevels) return 0; trace_xfs_btree_updkeys(cur, level, bp0); lkey = &key; hkey = xfs_btree_high_key_from_key(cur, lkey); xfs_btree_get_keys(cur, block, lkey); for (level++; level < cur->bc_nlevels; level++) { #ifdef DEBUG int error; #endif block = xfs_btree_get_block(cur, level, &bp); trace_xfs_btree_updkeys(cur, level, bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) return error; #endif ptr = cur->bc_levels[level].ptr; nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && xfs_btree_keycmp_eq(cur, nlkey, lkey) && xfs_btree_keycmp_eq(cur, nhkey, hkey)) break; xfs_btree_copy_keys(cur, nlkey, lkey, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); if (level + 1 >= cur->bc_nlevels) break; xfs_btree_get_node_keys(cur, block, lkey); } return 0; } /* Update all the keys from some level in cursor back to the root. */ STATIC int xfs_btree_updkeys_force( struct xfs_btree_cur *cur, int level) { struct xfs_buf *bp; struct xfs_btree_block *block; block = xfs_btree_get_block(cur, level, &bp); return __xfs_btree_updkeys(cur, level, block, bp, true); } /* * Update the parent keys of the given level, progressing towards the root. */ STATIC int xfs_btree_update_keys( struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; union xfs_btree_key *kp; union xfs_btree_key key; int ptr; ASSERT(level >= 0); block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); /* * Go up the tree from this level toward the root. * At each level, update the key value to the value input. * Stop when we reach a level where the cursor isn't pointing * at the first entry in the block. */ xfs_btree_get_keys(cur, block, &key); for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { #ifdef DEBUG int error; #endif block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) return error; #endif ptr = cur->bc_levels[level].ptr; kp = xfs_btree_key_addr(cur, ptr, block); xfs_btree_copy_keys(cur, kp, &key, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); } return 0; } /* * Update the record referred to by cur to the value in the * given record. This either works (return 0) or gets an * EFSCORRUPTED error. */ int xfs_btree_update( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { struct xfs_btree_block *block; struct xfs_buf *bp; int error; int ptr; union xfs_btree_rec *rp; /* Pick up the current block. */ block = xfs_btree_get_block(cur, 0, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, 0, bp); if (error) goto error0; #endif /* Get the address of the rec to be updated. */ ptr = cur->bc_levels[0].ptr; rp = xfs_btree_rec_addr(cur, ptr, block); /* Fill in the new contents and log them. */ xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_log_recs(cur, bp, ptr, ptr); /* Pass new key value up to our parent. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, 0); if (error) goto error0; } return 0; error0: return error; } /* * Move 1 record left from cur/level if possible. * Update cur to reflect the new path. */ STATIC int /* error */ xfs_btree_lshift( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ int lrecs; /* left record count */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ struct xfs_btree_cur *tcur; /* temporary btree cursor */ int rrecs; /* right record count */ union xfs_btree_ptr lptr; /* left btree pointer */ union xfs_btree_key *rkp = NULL; /* right btree key */ union xfs_btree_ptr *rpp = NULL; /* right address pointer */ union xfs_btree_rec *rrp = NULL; /* right record pointer */ int error; /* error return value */ int i; if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "right". */ right = xfs_btree_get_block(cur, level, &rbp); #ifdef DEBUG error = xfs_btree_check_block(cur, right, level, rbp); if (error) goto error0; #endif /* If we've got no left sibling then we can't shift an entry left. */ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); if (xfs_btree_ptr_is_null(cur, &lptr)) goto out0; /* * If the cursor entry is the one that would be moved, don't * do it... it's too complicated. */ if (cur->bc_levels[level].ptr <= 1) goto out0; /* Set up the left neighbor as "left". */ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; /* If it's full, it can't take another entry. */ lrecs = xfs_btree_get_numrecs(left); if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) goto out0; rrecs = xfs_btree_get_numrecs(right); /* * We add one entry to the left side and remove one for the right side. * Account for it here, the changes will be updated on disk and logged * later. */ lrecs++; rrecs--; XFS_BTREE_STATS_INC(cur, lshift); XFS_BTREE_STATS_ADD(cur, moves, 1); /* * If non-leaf, copy a key and a ptr to the left block. * Log the changes to the left block. */ if (level > 0) { /* It's a non-leaf. Move keys and pointers. */ union xfs_btree_key *lkp; /* left btree key */ union xfs_btree_ptr *lpp; /* left address pointer */ lkp = xfs_btree_key_addr(cur, lrecs, left); rkp = xfs_btree_key_addr(cur, 1, right); lpp = xfs_btree_ptr_addr(cur, lrecs, left); rpp = xfs_btree_ptr_addr(cur, 1, right); error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); if (error) goto error0; xfs_btree_copy_keys(cur, lkp, rkp, 1); xfs_btree_copy_ptrs(cur, lpp, rpp, 1); xfs_btree_log_keys(cur, lbp, lrecs, lrecs); xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); ASSERT(cur->bc_ops->keys_inorder(cur, xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ lrp = xfs_btree_rec_addr(cur, lrecs, left); rrp = xfs_btree_rec_addr(cur, 1, right); xfs_btree_copy_recs(cur, lrp, rrp, 1); xfs_btree_log_recs(cur, lbp, lrecs, lrecs); ASSERT(cur->bc_ops->recs_inorder(cur, xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); } xfs_btree_set_numrecs(left, lrecs); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); xfs_btree_set_numrecs(right, rrecs); xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); /* * Slide the contents of right down one entry. */ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) goto error0; } xfs_btree_shift_keys(cur, xfs_btree_key_addr(cur, 2, right), -1, rrecs); xfs_btree_shift_ptrs(cur, xfs_btree_ptr_addr(cur, 2, right), -1, rrecs); xfs_btree_log_keys(cur, rbp, 1, rrecs); xfs_btree_log_ptrs(cur, rbp, 1, rrecs); } else { /* It's a leaf. operate on records */ xfs_btree_shift_recs(cur, xfs_btree_rec_addr(cur, 2, right), -1, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); } /* * Using a temporary cursor, update the parent key values of the * block on the left. */ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error1; /* Update the parent high keys of the left block, if needed. */ error = xfs_btree_update_keys(tcur, level); if (error) goto error1; xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); } /* Update the parent keys of the right block. */ error = xfs_btree_update_keys(cur, level); if (error) goto error0; /* Slide the cursor value left one. */ cur->bc_levels[level].ptr--; *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } /* * Move 1 record right from cur/level if possible. * Update cur to reflect the new path. */ STATIC int /* error */ xfs_btree_rshift( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ struct xfs_btree_cur *tcur; /* temporary btree cursor */ union xfs_btree_ptr rptr; /* right block pointer */ union xfs_btree_key *rkp; /* right btree key */ int rrecs; /* right record count */ int lrecs; /* left record count */ int error; /* error return value */ int i; /* loop counter */ if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "left". */ left = xfs_btree_get_block(cur, level, &lbp); #ifdef DEBUG error = xfs_btree_check_block(cur, left, level, lbp); if (error) goto error0; #endif /* If we've got no right sibling then we can't shift an entry right. */ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); if (xfs_btree_ptr_is_null(cur, &rptr)) goto out0; /* * If the cursor entry is the one that would be moved, don't * do it... it's too complicated. */ lrecs = xfs_btree_get_numrecs(left); if (cur->bc_levels[level].ptr >= lrecs) goto out0; /* Set up the right neighbor as "right". */ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; /* If it's full, it can't take another entry. */ rrecs = xfs_btree_get_numrecs(right); if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) goto out0; XFS_BTREE_STATS_INC(cur, rshift); XFS_BTREE_STATS_ADD(cur, moves, rrecs); /* * Make a hole at the start of the right neighbor block, then * copy the last left block entry to the hole. */ if (level > 0) { /* It's a nonleaf. make a hole in the keys and ptrs */ union xfs_btree_key *lkp; union xfs_btree_ptr *lpp; union xfs_btree_ptr *rpp; lkp = xfs_btree_key_addr(cur, lrecs, left); lpp = xfs_btree_ptr_addr(cur, lrecs, left); rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); for (i = rrecs - 1; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, rpp, i, level); if (error) goto error0; } xfs_btree_shift_keys(cur, rkp, 1, rrecs); xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); if (error) goto error0; /* Now put the new data in, and log it. */ xfs_btree_copy_keys(cur, rkp, lkp, 1); xfs_btree_copy_ptrs(cur, rpp, lpp, 1); xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); ASSERT(cur->bc_ops->keys_inorder(cur, rkp, xfs_btree_key_addr(cur, 2, right))); } else { /* It's a leaf. make a hole in the records */ union xfs_btree_rec *lrp; union xfs_btree_rec *rrp; lrp = xfs_btree_rec_addr(cur, lrecs, left); rrp = xfs_btree_rec_addr(cur, 1, right); xfs_btree_shift_recs(cur, rrp, 1, rrecs); /* Now put the new data in, and log it. */ xfs_btree_copy_recs(cur, rrp, lrp, 1); xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); } /* * Decrement and log left's numrecs, bump and log right's numrecs. */ xfs_btree_set_numrecs(left, --lrecs); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); xfs_btree_set_numrecs(right, ++rrecs); xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); /* * Using a temporary cursor, update the parent key values of the * block on the right. */ error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_increment(tcur, level, &i); if (error) goto error1; /* Update the parent high keys of the left block, if needed. */ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error1; } /* Update the parent keys of the right block. */ error = xfs_btree_update_keys(tcur, level); if (error) goto error1; xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } static inline int xfs_btree_alloc_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *hint_block, union xfs_btree_ptr *new_block, int *stat) { int error; /* * Don't allow block allocation for a staging cursor, because staging * cursors do not support regular btree modifications. * * Bulk loading uses a separate callback to obtain new blocks from a * preallocated list, which prevents ENOSPC failures during loading. */ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { ASSERT(0); return -EFSCORRUPTED; } error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); trace_xfs_btree_alloc_block(cur, new_block, *stat, error); return error; } /* * Split cur/level block in half. * Return new block number and the key to its first * record (to be inserted into parent). */ STATIC int /* error */ __xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, union xfs_btree_key *key, struct xfs_btree_cur **curp, int *stat) /* success/failure */ { union xfs_btree_ptr lptr; /* left sibling block ptr */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ union xfs_btree_ptr rptr; /* right sibling block ptr */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ union xfs_btree_ptr rrptr; /* right-right sibling ptr */ struct xfs_buf *rrbp; /* right-right buffer pointer */ struct xfs_btree_block *rrblock; /* right-right btree block */ int lrecs; int rrecs; int src_index; int error; /* error return value */ int i; XFS_BTREE_STATS_INC(cur, split); /* Set up left block (current one). */ left = xfs_btree_get_block(cur, level, &lbp); #ifdef DEBUG error = xfs_btree_check_block(cur, left, level, lbp); if (error) goto error0; #endif xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) goto out0; XFS_BTREE_STATS_INC(cur, alloc); /* Set up the new block as "right". */ error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); if (error) goto error0; /* Fill in the btree header for the new right block. */ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. * Make sure that if there's an odd number of entries now, that * each new block will have the same number of entries. */ lrecs = xfs_btree_get_numrecs(left); rrecs = lrecs / 2; if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) rrecs++; src_index = (lrecs - rrecs + 1); XFS_BTREE_STATS_ADD(cur, moves, rrecs); /* Adjust numrecs for the later get_*_keys() calls. */ lrecs -= rrecs; xfs_btree_set_numrecs(left, lrecs); xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); /* * Copy btree block entries from the left block over to the * new block, the right. Update the right block and log the * changes. */ if (level > 0) { /* It's a non-leaf. Move keys and pointers. */ union xfs_btree_key *lkp; /* left btree key */ union xfs_btree_ptr *lpp; /* left address pointer */ union xfs_btree_key *rkp; /* right btree key */ union xfs_btree_ptr *rpp; /* right address pointer */ lkp = xfs_btree_key_addr(cur, src_index, left); lpp = xfs_btree_ptr_addr(cur, src_index, left); rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); for (i = src_index; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, lpp, i, level); if (error) goto error0; } /* Copy the keys & pointers to the new block. */ xfs_btree_copy_keys(cur, rkp, lkp, rrecs); xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); xfs_btree_log_keys(cur, rbp, 1, rrecs); xfs_btree_log_ptrs(cur, rbp, 1, rrecs); /* Stash the keys of the new block for later insertion. */ xfs_btree_get_node_keys(cur, right, key); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ union xfs_btree_rec *rrp; /* right record pointer */ lrp = xfs_btree_rec_addr(cur, src_index, left); rrp = xfs_btree_rec_addr(cur, 1, right); /* Copy records to the new block. */ xfs_btree_copy_recs(cur, rrp, lrp, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); /* Stash the keys of the new block for later insertion. */ xfs_btree_get_leaf_keys(cur, right, key); } /* * Find the left block number by looking in the buffer. * Adjust sibling pointers. */ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); /* * If there's a block to the new block's right, make that block * point back to right instead of to left. */ if (!xfs_btree_ptr_is_null(cur, &rrptr)) { error = xfs_btree_read_buf_block(cur, &rrptr, 0, &rrblock, &rrbp); if (error) goto error0; xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); } /* Update the parent high keys of the left block, if needed. */ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; } /* * If the cursor is really in the right block, move it there. * If it's just pointing past the last entry in left, then we'll * insert there, so don't change anything in that case. */ if (cur->bc_levels[level].ptr > lrecs + 1) { xfs_btree_setbuf(cur, level, rbp); cur->bc_levels[level].ptr -= lrecs; } /* * If there are more levels, we'll need another cursor which refers * the right block, no matter where this cursor was. */ if (level + 1 < cur->bc_nlevels) { error = xfs_btree_dup_cursor(cur, curp); if (error) goto error0; (*curp)->bc_levels[level + 1].ptr++; } *ptrp = rptr; *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; } #ifdef __KERNEL__ struct xfs_btree_split_args { struct xfs_btree_cur *cur; int level; union xfs_btree_ptr *ptrp; union xfs_btree_key *key; struct xfs_btree_cur **curp; int *stat; /* success/failure */ int result; bool kswapd; /* allocation in kswapd context */ struct completion *done; struct work_struct work; }; /* * Stack switching interfaces for allocation */ static void xfs_btree_split_worker( struct work_struct *work) { struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; unsigned long new_pflags = 0; /* * we are in a transaction context here, but may also be doing work * in kswapd context, and hence we may need to inherit that state * temporarily to ensure that we don't block waiting for memory reclaim * in any way. */ if (args->kswapd) new_pflags |= PF_MEMALLOC | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); xfs_trans_set_context(args->cur->bc_tp); args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, args->key, args->curp, args->stat); xfs_trans_clear_context(args->cur->bc_tp); current_restore_flags_nested(&pflags, new_pflags); /* * Do not access args after complete() has run here. We don't own args * and the owner may run and free args before we return here. */ complete(args->done); } /* * BMBT split requests often come in with little stack to work on so we push * them off to a worker thread so there is lots of stack to use. For the other * btree types, just call directly to avoid the context switch overhead here. * * Care must be taken here - the work queue rescuer thread introduces potential * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new * AGFs to allocate blocks. A task being run by the rescuer could attempt to * lock an AGF that is already locked by a task queued to run by the rescuer, * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to * release it until the current thread it is running gains the lock. * * To avoid this issue, we only ever queue BMBT splits that don't have an AGF * already locked to allocate from. The only place that doesn't hold an AGF * locked is unwritten extent conversion at IO completion, but that has already * been offloaded to a worker thread and hence has no stack consumption issues * we have to worry about. */ STATIC int /* error */ xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, union xfs_btree_key *key, struct xfs_btree_cur **curp, int *stat) /* success/failure */ { struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); if (!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); args.cur = cur; args.level = level; args.ptrp = ptrp; args.key = key; args.curp = curp; args.stat = stat; args.done = &done; args.kswapd = current_is_kswapd(); INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); queue_work(xfs_alloc_wq, &args.work); wait_for_completion(&done); destroy_work_on_stack(&args.work); return args.result; } #else #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ /* * Copy the old inode root contents into a real block and make the * broot point to it. */ int /* error */ xfs_btree_new_iroot( struct xfs_btree_cur *cur, /* btree cursor */ int *logflags, /* logging flags for inode */ int *stat) /* return status - 0 fail */ { struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ union xfs_btree_key *ckp; /* child key pointer */ union xfs_btree_ptr *cpp; /* child ptr pointer */ union xfs_btree_key *kp; /* pointer to btree key */ union xfs_btree_ptr *pp; /* pointer to block addr */ union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ int i; /* loop counter */ XFS_BTREE_STATS_INC(cur, newroot); ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); level = cur->bc_nlevels - 1; block = xfs_btree_get_iroot(cur); pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) return 0; XFS_BTREE_STATS_INC(cur, alloc); /* Copy the root into a real block. */ error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); if (error) goto error0; /* * we can't just memcpy() the root in for CRC enabled btree blocks. * In that case have to also ensure the blkno remains correct */ memcpy(cblock, block, xfs_btree_block_len(cur)); if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) cblock->bb_u.l.bb_blkno = bno; else cblock->bb_u.s.bb_blkno = bno; } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); cur->bc_nlevels++; ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); cur->bc_levels[level + 1].ptr = 1; kp = xfs_btree_key_addr(cur, 1, block); ckp = xfs_btree_key_addr(cur, 1, cblock); xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); cpp = xfs_btree_ptr_addr(cur, 1, cblock); for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) goto error0; } xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); if (error) goto error0; xfs_btree_copy_ptrs(cur, pp, &nptr, 1); xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); /* * Do all this logging at the end so that * the root is at the right level. */ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: return error; } static void xfs_btree_set_root( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int inc) { if (cur->bc_flags & XFS_BTREE_STAGING) { /* Update the btree root information for a per-AG fake root. */ cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); cur->bc_ag.afake->af_levels += inc; } else { cur->bc_ops->set_root(cur, ptr, inc); } } /* * Allocate a new root block, fill it in. */ STATIC int /* error */ xfs_btree_new_root( struct xfs_btree_cur *cur, /* btree cursor */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* one half of the old root block */ struct xfs_buf *bp; /* buffer containing block */ int error; /* error return value */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ struct xfs_buf *nbp; /* new (root) buffer */ struct xfs_btree_block *new; /* new (root) btree block */ int nptr; /* new value for key index, 1 or 2 */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ union xfs_btree_ptr rptr; union xfs_btree_ptr lptr; XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ xfs_btree_init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) goto out0; XFS_BTREE_STATS_INC(cur, alloc); /* Set up the new block. */ error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); if (error) goto error0; /* Set the root in the holding structure increasing the level by 1. */ xfs_btree_set_root(cur, &lptr, 1); /* * At the previous root level there are now two blocks: the old root, * and the new block generated when it was split. We don't know which * one the cursor is pointing at, so we set up variables "left" and * "right" for each case. */ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); if (error) goto error0; #endif xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); if (!xfs_btree_ptr_is_null(cur, &rptr)) { /* Our block is left, pick up the right block. */ lbp = bp; xfs_btree_buf_to_ptr(cur, lbp, &lptr); left = block; error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; bp = rbp; nptr = 1; } else { /* Our block is right, pick up the left block. */ rbp = bp; xfs_btree_buf_to_ptr(cur, rbp, &rptr); right = block; xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; bp = lbp; nptr = 2; } /* Fill in the new block's btree header and log it. */ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); /* Fill in the key data in the new root. */ if (xfs_btree_get_level(left) > 0) { /* * Get the keys for the left block's keys and put them directly * in the parent block. Do the same for the right block. */ xfs_btree_get_node_keys(cur, left, xfs_btree_key_addr(cur, 1, new)); xfs_btree_get_node_keys(cur, right, xfs_btree_key_addr(cur, 2, new)); } else { /* * Get the keys for the left block's records and put them * directly in the parent block. Do the same for the right * block. */ xfs_btree_get_leaf_keys(cur, left, xfs_btree_key_addr(cur, 1, new)); xfs_btree_get_leaf_keys(cur, right, xfs_btree_key_addr(cur, 2, new)); } xfs_btree_log_keys(cur, nbp, 1, 2); /* Fill in the pointer data in the new root. */ xfs_btree_copy_ptrs(cur, xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); xfs_btree_copy_ptrs(cur, xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); xfs_btree_log_ptrs(cur, nbp, 1, 2); /* Fix up the cursor. */ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); cur->bc_levels[cur->bc_nlevels].ptr = nptr; cur->bc_nlevels++; ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); *stat = 1; return 0; error0: return error; out0: *stat = 0; return 0; } STATIC int xfs_btree_make_block_unfull( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* btree level */ int numrecs,/* # of recs in block */ int *oindex,/* old tree index */ int *index, /* new tree index */ union xfs_btree_ptr *nptr, /* new btree ptr */ struct xfs_btree_cur **ncur, /* new btree cursor */ union xfs_btree_key *key, /* key of new block */ int *stat) { int error = 0; if (xfs_btree_at_iroot(cur, level)) { struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ int logflags = 0; error = xfs_btree_new_iroot(cur, &logflags, stat); if (error || *stat == 0) return error; xfs_trans_log_inode(cur->bc_tp, ip, logflags); } return 0; } /* First, try shifting an entry to the right neighbor. */ error = xfs_btree_rshift(cur, level, stat); if (error || *stat) return error; /* Next, try shifting an entry to the left neighbor. */ error = xfs_btree_lshift(cur, level, stat); if (error) return error; if (*stat) { *oindex = *index = cur->bc_levels[level].ptr; return 0; } /* * Next, try splitting the current block in half. * * If this works we have to re-set our variables because we * could be in a different block now. */ error = xfs_btree_split(cur, level, nptr, key, ncur, stat); if (error || *stat == 0) return error; *index = cur->bc_levels[level].ptr; return 0; } /* * Insert one record/level. Return information to the caller * allowing the next level up to proceed if necessary. */ STATIC int xfs_btree_insrec( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level to insert record at */ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ union xfs_btree_rec *rec, /* record to insert */ union xfs_btree_key *key, /* i/o: block key for ptrp */ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ int numrecs;/* number of records */ int error; /* error return value */ int i; xfs_daddr_t old_bn; ncur = NULL; lkey = &nkey; /* * If we have an external root pointer, and we've made it to the * root level, allocate a new root block and we're done. */ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && level >= cur->bc_nlevels) { error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); return error; } /* If we're off the left edge, return failure. */ ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; } optr = ptr; XFS_BTREE_STATS_INC(cur, insrec); /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; /* Check that the new entry is being inserted in the right place. */ if (ptr <= numrecs) { if (level == 0) { ASSERT(cur->bc_ops->recs_inorder(cur, rec, xfs_btree_rec_addr(cur, ptr, block))); } else { ASSERT(cur->bc_ops->keys_inorder(cur, key, xfs_btree_key_addr(cur, ptr, block))); } } #endif /* * If the block is full, we can't insert the new entry until we * make the block un-full. */ xfs_btree_set_ptr_null(cur, &nptr); if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { error = xfs_btree_make_block_unfull(cur, level, numrecs, &optr, &ptr, &nptr, &ncur, lkey, stat); if (error || *stat == 0) goto error0; } /* * The current block may have changed if the block was * previously full and we have just made space in it. */ block = xfs_btree_get_block(cur, level, &bp); numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* * At this point we know there's room for our new entry in the block * we're pointing at. */ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); if (level > 0) { /* It's a nonleaf. make a hole in the keys and ptrs */ union xfs_btree_key *kp; union xfs_btree_ptr *pp; kp = xfs_btree_key_addr(cur, ptr, block); pp = xfs_btree_ptr_addr(cur, ptr, block); for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); if (error) goto error0; /* Now put the new data in, bump numrecs and log it. */ xfs_btree_copy_keys(cur, kp, key, 1); xfs_btree_copy_ptrs(cur, pp, ptrp, 1); numrecs++; xfs_btree_set_numrecs(block, numrecs); xfs_btree_log_ptrs(cur, bp, ptr, numrecs); xfs_btree_log_keys(cur, bp, ptr, numrecs); #ifdef DEBUG if (ptr < numrecs) { ASSERT(cur->bc_ops->keys_inorder(cur, kp, xfs_btree_key_addr(cur, ptr + 1, block))); } #endif } else { /* It's a leaf. make a hole in the records */ union xfs_btree_rec *rp; rp = xfs_btree_rec_addr(cur, ptr, block); xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); /* Now put the new data in, bump numrecs and log it. */ xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_set_numrecs(block, ++numrecs); xfs_btree_log_recs(cur, bp, ptr, numrecs); #ifdef DEBUG if (ptr < numrecs) { ASSERT(cur->bc_ops->recs_inorder(cur, rp, xfs_btree_rec_addr(cur, ptr + 1, block))); } #endif } /* Log the new number of records in the btree header. */ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* * Update btree keys to reflect the newly added record or keyptr. * There are three cases here to be aware of. Normally, all we have to * do is walk towards the root, updating keys as necessary. * * If the caller had us target a full block for the insertion, we dealt * with that by calling the _make_block_unfull function. If the * "make unfull" function splits the block, it'll hand us back the key * and pointer of the new block. We haven't yet added the new block to * the next level up, so if we decide to add the new record to the new * block (bp->b_bn != old_bn), we have to update the caller's pointer * so that the caller adds the new block with the correct key. * * However, there is a third possibility-- if the selected block is the * root block of an inode-rooted btree and cannot be expanded further, * the "make unfull" function moves the root block contents to a new * block and updates the root block to point to the new block. In this * case, no block pointer is passed back because the block has already * been added to the btree. In this case, we need to use the regular * key update function, just like the first case. This is critical for * overlapping btrees, because the high key must be updated to reflect * the entire tree, not just the subtree accessible through the first * child of the root (which is now two levels down from the root). */ if (!xfs_btree_ptr_is_null(cur, &nptr) && bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; } /* * Return the new block number, if any. * If there is one, give back a record value and a cursor too. */ *ptrp = nptr; if (!xfs_btree_ptr_is_null(cur, &nptr)) { xfs_btree_copy_keys(cur, key, lkey, 1); *curp = ncur; } *stat = 1; return 0; error0: if (ncur) xfs_btree_del_cursor(ncur, error); return error; } /* * Insert the record at the point referenced by cur. * * A multi-level split of the tree on insert will invalidate the original * cursor. All callers of this function should assume that the cursor is * no longer valid and revalidate it. */ int xfs_btree_insert( struct xfs_btree_cur *cur, int *stat) { int error; /* error return value */ int i; /* result value, 0 for failure */ int level; /* current level number in btree */ union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ union xfs_btree_key bkey; /* key of block to insert */ union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; key = &bkey; xfs_btree_set_ptr_null(cur, &nptr); /* Make a key out of the record data to be inserted, and save it. */ cur->bc_ops->init_rec_from_cur(cur, &rec); cur->bc_ops->init_key_from_rec(key, &rec); /* * Loop going up the tree, starting at the leaf level. * Stop when we don't get a split block, that must mean that * the insert is finished with this level. */ do { /* * Insert nrec/nptr into this level of the tree. * Note if we fail, nptr will be null. */ error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, &ncur, &i); if (error) { if (pcur != cur) xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); goto error0; } if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } level++; /* * See if the cursor we just used is trash. * Can't trash the caller's cursor, but otherwise we should * if ncur is a new cursor or we're about to be done. */ if (pcur != cur && (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { /* Save the state from the cursor before we trash it */ if (cur->bc_ops->update_cursor && !(cur->bc_flags & XFS_BTREE_STAGING)) cur->bc_ops->update_cursor(pcur, cur); cur->bc_nlevels = pcur->bc_nlevels; xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); } /* If we got a new cursor, switch to it. */ if (ncur) { pcur = ncur; ncur = NULL; } } while (!xfs_btree_ptr_is_null(cur, &nptr)); *stat = i; return 0; error0: return error; } /* * Try to merge a non-leaf block back into the inode root. * * Note: the killroot names comes from the fact that we're effectively * killing the old root block. But because we can't just delete the * inode we have to copy the single block it was pointing to into the * inode. */ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; union xfs_btree_key *ckp; union xfs_btree_ptr *pp; union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; int index; int numrecs; int error; #ifdef DEBUG union xfs_btree_ptr ptr; #endif int i; ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_nlevels > 1); /* * Don't deal with the root block needs to be a leaf case. * We're just going to turn the thing back into extents anyway. */ level = cur->bc_nlevels - 1; if (level == 1) goto out0; /* * Give up if the root has multiple children. */ block = xfs_btree_get_iroot(cur); if (xfs_btree_get_numrecs(block) != 1) goto out0; cblock = xfs_btree_get_block(cur, level - 1, &cbp); numrecs = xfs_btree_get_numrecs(cblock); /* * Only do this if the next level will fit. * Then the data must be copied up to the inode, * instead of freeing the root you free the next level. */ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) goto out0; XFS_BTREE_STATS_INC(cur, killroot); #ifdef DEBUG xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { xfs_iroot_realloc(cur->bc_ino.ip, index, cur->bc_ino.whichfork); block = ifp->if_broot; } be16_add_cpu(&block->bb_numrecs, index); ASSERT(block->bb_numrecs == cblock->bb_numrecs); kp = xfs_btree_key_addr(cur, 1, block); ckp = xfs_btree_key_addr(cur, 1, cblock); xfs_btree_copy_keys(cur, kp, ckp, numrecs); pp = xfs_btree_ptr_addr(cur, 1, block); cpp = xfs_btree_ptr_addr(cur, 1, cblock); for (i = 0; i < numrecs; i++) { error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); if (error) return error; } xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); error = xfs_btree_free_block(cur, cbp); if (error) return error; cur->bc_levels[level - 1].bp = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; } /* * Kill the current root node, and replace it with it's only child node. */ STATIC int xfs_btree_kill_root( struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, union xfs_btree_ptr *newroot) { int error; XFS_BTREE_STATS_INC(cur, killroot); /* * Update the root pointer, decreasing the level by 1 and then * free the old root. */ xfs_btree_set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); if (error) return error; cur->bc_levels[level].bp = NULL; cur->bc_levels[level].ra = 0; cur->bc_nlevels--; return 0; } STATIC int xfs_btree_dec_cursor( struct xfs_btree_cur *cur, int level, int *stat) { int error; int i; if (level > 0) { error = xfs_btree_decrement(cur, level, &i); if (error) return error; } *stat = 1; return 0; } /* * Single level of the btree record deletion routine. * Delete record pointed to by cur/level. * Remove the record from its block then rebalance the tree. * Return 0 for error, 1 for done, 2 to go on to the next level. */ STATIC int /* error */ xfs_btree_delrec( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level removing record from */ int *stat) /* fail/done/go-on */ { struct xfs_btree_block *block; /* btree block */ union xfs_btree_ptr cptr; /* current block ptr */ struct xfs_buf *bp; /* buffer for block */ int error; /* error return value */ int i; /* loop counter */ union xfs_btree_ptr lptr; /* left sibling block ptr */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ int lrecs = 0; /* left record count */ int ptr; /* key/record index */ union xfs_btree_ptr rptr; /* right sibling block ptr */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ struct xfs_btree_block *rrblock; /* right-right btree block */ struct xfs_buf *rrbp; /* right-right buffer pointer */ int rrecs = 0; /* right record count */ struct xfs_btree_cur *tcur; /* temporary btree cursor */ int numrecs; /* temporary numrec count */ tcur = NULL; /* Get the index of the entry being deleted, check for nothing there. */ ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; } /* Get the buffer & block containing the record or key/ptr. */ block = xfs_btree_get_block(cur, level, &bp); numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* Fail if we're off the end of the block. */ if (ptr > numrecs) { *stat = 0; return 0; } XFS_BTREE_STATS_INC(cur, delrec); XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); /* Excise the entries being deleted. */ if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ union xfs_btree_key *lkp; union xfs_btree_ptr *lpp; lkp = xfs_btree_key_addr(cur, ptr + 1, block); lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); for (i = 0; i < numrecs - ptr; i++) { error = xfs_btree_debug_check_ptr(cur, lpp, i, level); if (error) goto error0; } if (ptr < numrecs) { xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); } } else { /* It's a leaf. operate on records */ if (ptr < numrecs) { xfs_btree_shift_recs(cur, xfs_btree_rec_addr(cur, ptr + 1, block), -1, numrecs - ptr); xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); } } /* * Decrement and log the number of entries in the block. */ xfs_btree_set_numrecs(block, --numrecs); xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's * nothing left to do. */ if (xfs_btree_at_iroot(cur, level)) { xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) goto error0; error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; *stat = 1; return 0; } /* * If this is the root level, and there's only one entry left, and it's * NOT the leaf level, then we can get rid of this level. */ if (level == cur->bc_nlevels - 1) { if (numrecs == 1 && level > 0) { union xfs_btree_ptr *pp; /* * pp is still set to the first pointer in the block. * Make it the new root of the btree. */ pp = xfs_btree_ptr_addr(cur, 1, block); error = xfs_btree_kill_root(cur, bp, level, pp); if (error) goto error0; } else if (level > 0) { error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; } *stat = 1; return 0; } /* * If we deleted the leftmost entry in the block, update the * key values above us in the tree. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; } /* * If the number of records remaining in the block is at least * the minimum, we're done. */ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; return 0; } /* * Otherwise, we have to move some records around to keep the * tree balanced. Look at the left and right sibling blocks to * see if we can re-balance by moving only one record. */ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * One child of root, need to get a chance to copy its contents * into the root and delete it. Can't go up to next level, * there's nothing to delete there. */ if (xfs_btree_ptr_is_null(cur, &rptr) && xfs_btree_ptr_is_null(cur, &lptr) && level == cur->bc_nlevels - 2) { error = xfs_btree_kill_iroot(cur); if (!error) error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; return 0; } } ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || !xfs_btree_ptr_is_null(cur, &lptr)); /* * Duplicate the cursor so our btree manipulations here won't * disrupt the next level up. */ error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; /* * If there's a right sibling, see if it's ok to shift an entry * out of it. */ if (!xfs_btree_ptr_is_null(cur, &rptr)) { /* * Move the temp cursor to the last entry in the next block. * Actually any entry but the first would suffice. */ i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); #ifdef DEBUG error = xfs_btree_check_block(tcur, right, level, rbp); if (error) goto error0; #endif /* Grab the current block number, for future use. */ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); /* * If right block is full enough so that removing one entry * won't make it too empty, and left-shifting an entry out * of right to us works, we're done. */ if (xfs_btree_get_numrecs(right) - 1 >= cur->bc_ops->get_minrecs(tcur, level)) { error = xfs_btree_lshift(tcur, level, &i); if (error) goto error0; if (i) { ASSERT(xfs_btree_get_numrecs(block) >= cur->bc_ops->get_minrecs(tcur, level)); xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); tcur = NULL; error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; return 0; } } /* * Otherwise, grab the number of records in right for * future reference, and fix up the temp cursor to point * to our block again (last record). */ rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } } } /* * If there's a left sibling, see if it's ok to shift an entry * out of it. */ if (!xfs_btree_ptr_is_null(cur, &lptr)) { /* * Move the temp cursor to the first entry in the * previous block. */ i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } /* Grab a pointer to the block. */ left = xfs_btree_get_block(tcur, level, &lbp); #ifdef DEBUG error = xfs_btree_check_block(cur, left, level, lbp); if (error) goto error0; #endif /* Grab the current block number, for future use. */ xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); /* * If left block is full enough so that removing one entry * won't make it too empty, and right-shifting an entry out * of left to us works, we're done. */ if (xfs_btree_get_numrecs(left) - 1 >= cur->bc_ops->get_minrecs(tcur, level)) { error = xfs_btree_rshift(tcur, level, &i); if (error) goto error0; if (i) { ASSERT(xfs_btree_get_numrecs(block) >= cur->bc_ops->get_minrecs(tcur, level)); xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); tcur = NULL; if (level == 0) cur->bc_levels[0].ptr++; *stat = 1; return 0; } } /* * Otherwise, grab the number of records in right for * future reference. */ lrecs = xfs_btree_get_numrecs(left); } /* Delete the temp cursor, we're done with it. */ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); tcur = NULL; /* If here, we need to do a join to keep the tree balanced. */ ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); if (!xfs_btree_ptr_is_null(cur, &lptr) && lrecs + xfs_btree_get_numrecs(block) <= cur->bc_ops->get_maxrecs(cur, level)) { /* * Set "right" to be the starting block, * "left" to be the left neighbor. */ rptr = cptr; right = block; rbp = bp; error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; /* * If that won't work, see if we can join with the right neighbor block. */ } else if (!xfs_btree_ptr_is_null(cur, &rptr) && rrecs + xfs_btree_get_numrecs(block) <= cur->bc_ops->get_maxrecs(cur, level)) { /* * Set "left" to be the starting block, * "right" to be the right neighbor. */ lptr = cptr; left = block; lbp = bp; error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; /* * Otherwise, we can't fix the imbalance. * Just return. This is probably a logic error, but it's not fatal. */ } else { error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; return 0; } rrecs = xfs_btree_get_numrecs(right); lrecs = xfs_btree_get_numrecs(left); /* * We're now going to join "left" and "right" by moving all the stuff * in "right" to "left" and deleting "right". */ XFS_BTREE_STATS_ADD(cur, moves, rrecs); if (level > 0) { /* It's a non-leaf. Move keys and pointers. */ union xfs_btree_key *lkp; /* left btree key */ union xfs_btree_ptr *lpp; /* left address pointer */ union xfs_btree_key *rkp; /* right btree key */ union xfs_btree_ptr *rpp; /* right address pointer */ lkp = xfs_btree_key_addr(cur, lrecs + 1, left); lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); for (i = 1; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i, level); if (error) goto error0; } xfs_btree_copy_keys(cur, lkp, rkp, rrecs); xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ union xfs_btree_rec *rrp; /* right record pointer */ lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); rrp = xfs_btree_rec_addr(cur, 1, right); xfs_btree_copy_recs(cur, lrp, rrp, rrecs); xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); } XFS_BTREE_STATS_INC(cur, join); /* * Fix up the number of records and right block pointer in the * surviving block, and log it. */ xfs_btree_set_numrecs(left, lrecs + rrecs); xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); /* If there is a right sibling, point it to the remaining block. */ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); if (!xfs_btree_ptr_is_null(cur, &cptr)) { error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); if (error) goto error0; xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); } /* Free the deleted block. */ error = xfs_btree_free_block(cur, rbp); if (error) goto error0; /* * If we joined with the left neighbor, set the buffer in the * cursor to the left block, and fix up the index. */ if (bp != lbp) { cur->bc_levels[level].bp = lbp; cur->bc_levels[level].ptr += lrecs; cur->bc_levels[level].ra = 0; } /* * If we joined with the right neighbor and there's a level above * us, increment the cursor at that level. */ else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || level + 1 < cur->bc_nlevels) { error = xfs_btree_increment(cur, level + 1, &i); if (error) goto error0; } /* * Readjust the ptr at this level if it's not a leaf, since it's * still pointing at the deletion point, which makes the cursor * inconsistent. If this makes the ptr 0, the caller fixes it up. * We can't use decrement because it would change the next level up. */ if (level > 0) cur->bc_levels[level].ptr--; /* * We combined blocks, so we have to update the parent keys if the * btree supports overlapped intervals. However, * bc_levels[level + 1].ptr points to the old block so that the caller * knows which record to delete. Therefore, the caller must be savvy * enough to call updkeys for us if we return stat == 2. The other * exit points from this function don't require deletions further up * the tree, so they can call updkeys directly. */ /* Return value means the next level up has something to do. */ *stat = 2; return 0; error0: if (tcur) xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } /* * Delete the record pointed to by cur. * The cursor refers to the place where the record was (could be inserted) * when the operation returns. */ int /* error */ xfs_btree_delete( struct xfs_btree_cur *cur, int *stat) /* success/failure */ { int error; /* error return value */ int level; int i; bool joined = false; /* * Go up the tree, starting at leaf level. * * If 2 is returned then a join was done; go to the next level. * Otherwise we are done. */ for (level = 0, i = 2; i == 2; level++) { error = xfs_btree_delrec(cur, level, &i); if (error) goto error0; if (i == 2) joined = true; } /* * If we combined blocks as part of deleting the record, delrec won't * have updated the parent high keys so we have to do that here. */ if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { error = xfs_btree_updkeys_force(cur, 0); if (error) goto error0; } if (i == 0) { for (level = 1; level < cur->bc_nlevels; level++) { if (cur->bc_levels[level].ptr == 0) { error = xfs_btree_decrement(cur, level, &i); if (error) goto error0; break; } } } *stat = i; return 0; error0: return error; } /* * Get the data from the pointed-to record. */ int /* error */ xfs_btree_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ union xfs_btree_rec **recp, /* output: btree record */ int *stat) /* output: success/failure */ { struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer pointer */ int ptr; /* record number */ #ifdef DEBUG int error; /* error return value */ #endif ptr = cur->bc_levels[0].ptr; block = xfs_btree_get_block(cur, 0, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, 0, bp); if (error) return error; #endif /* * Off the right end or left end, return failure. */ if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { *stat = 0; return 0; } /* * Point to the record and extract its data. */ *recp = xfs_btree_rec_addr(cur, ptr, block); *stat = 1; return 0; } /* Visit a block in a btree. */ STATIC int xfs_btree_visit_block( struct xfs_btree_cur *cur, int level, xfs_btree_visit_blocks_fn fn, void *data) { struct xfs_btree_block *block; struct xfs_buf *bp; union xfs_btree_ptr rptr, bufptr; int error; /* do right sibling readahead */ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); block = xfs_btree_get_block(cur, level, &bp); /* process the block */ error = fn(cur, level, data); if (error) return error; /* now read rh sibling block for next iteration */ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; /* * We only visit blocks once in this walk, so we have to avoid the * internal xfs_btree_lookup_get_block() optimisation where it will * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. */ xfs_btree_buf_to_ptr(cur, bp, &bufptr); if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } /* Visit every block in a btree. */ int xfs_btree_visit_blocks( struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data) { union xfs_btree_ptr lptr; int level; struct xfs_btree_block *block = NULL; int error = 0; xfs_btree_init_ptr_from_cur(cur, &lptr); /* for each level */ for (level = cur->bc_nlevels - 1; level >= 0; level--) { /* grab the left hand block */ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); if (error) return error; /* readahead the left most block for the next level down */ if (level > 0) { union xfs_btree_ptr *ptr; ptr = xfs_btree_ptr_addr(cur, 1, block); xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); if (!(flags & XFS_BTREE_VISIT_LEAVES)) continue; } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) { continue; } /* for each buffer in the level */ do { error = xfs_btree_visit_block(cur, level, fn, data); } while (!error); if (error != -ENOENT) return error; } return 0; } /* * Change the owner of a btree. * * The mechanism we use here is ordered buffer logging. Because we don't know * how many buffers were are going to need to modify, we don't really want to * have to make transaction reservations for the worst case of every buffer in a * full size btree as that may be more space that we can fit in the log.... * * We do the btree walk in the most optimal manner possible - we have sibling * pointers so we can just walk all the blocks on each level from left to right * in a single pass, and then move to the next level and do the same. We can * also do readahead on the sibling pointers to get IO moving more quickly, * though for slow disks this is unlikely to make much difference to performance * as the amount of CPU work we have to do before moving to the next block is * relatively small. * * For each btree block that we load, modify the owner appropriately, set the * buffer as an ordered buffer and log it appropriately. We need to ensure that * we mark the region we change dirty so that if the buffer is relogged in * a subsequent transaction the changes we make here as an ordered buffer are * correctly relogged in that transaction. If we are in recovery context, then * just queue the modified buffer as delayed write buffer so the transaction * recovery completion writes the changes to disk. */ struct xfs_btree_block_change_owner_info { uint64_t new_owner; struct list_head *buffer_list; }; static int xfs_btree_block_change_owner( struct xfs_btree_cur *cur, int level, void *data) { struct xfs_btree_block_change_owner_info *bbcoi = data; struct xfs_btree_block *block; struct xfs_buf *bp; /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); } else { if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); } /* * If the block is a root block hosted in an inode, we might not have a * buffer pointer here and we shouldn't attempt to log the change as the * information is already held in the inode and discarded when the root * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. */ if (!bp) { ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(level == cur->bc_nlevels - 1); return 0; } if (cur->bc_tp) { if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { xfs_btree_log_block(cur, bp, XFS_BB_OWNER); return -EAGAIN; } } else { xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; } int xfs_btree_change_owner( struct xfs_btree_cur *cur, uint64_t new_owner, struct list_head *buffer_list) { struct xfs_btree_block_change_owner_info bbcoi; bbcoi.new_owner = new_owner; bbcoi.buffer_list = buffer_list; return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner, XFS_BTREE_VISIT_ALL, &bbcoi); } /* Verify the v5 fields of a long-format btree block. */ xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify( struct xfs_buf *bp, uint64_t owner) { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (owner != XFS_RMAP_OWN_UNKNOWN && be64_to_cpu(block->bb_u.l.bb_owner) != owner) return __this_address; return NULL; } /* Verify a long-format btree block. */ xfs_failaddr_t xfs_btree_fsblock_verify( struct xfs_buf *bp, unsigned int max_recs) { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); xfs_fsblock_t fsb; xfs_failaddr_t fa; ASSERT(!xfs_buftarg_is_mem(bp->b_target)); /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } /* Verify an in-memory btree block. */ xfs_failaddr_t xfs_btree_memblock_verify( struct xfs_buf *bp, unsigned int max_recs) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buftarg *btp = bp->b_target; xfs_failaddr_t fa; xfbno_t bno; ASSERT(xfs_buftarg_is_mem(bp->b_target)); /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); fa = xfs_btree_check_memblock_siblings(btp, bno, block->bb_u.l.bb_leftsib); if (fa) return fa; fa = xfs_btree_check_memblock_siblings(btp, bno, block->bb_u.l.bb_rightsib); if (fa) return fa; return NULL; } /** * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block * * @bp: buffer containing the btree block */ xfs_failaddr_t xfs_btree_agblock_v5hdr_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag)) return __this_address; return NULL; } /** * xfs_btree_agblock_verify() -- verify a short-format btree block * * @bp: buffer containing the btree block * @max_recs: maximum records allowed in this btree node */ xfs_failaddr_t xfs_btree_agblock_verify( struct xfs_buf *bp, unsigned int max_recs) { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); xfs_agblock_t agbno; xfs_failaddr_t fa; ASSERT(!xfs_buftarg_is_mem(bp->b_target)); /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_rightsib); return fa; } /* * For the given limits on leaf and keyptr records per block, calculate the * height of the tree needed to index the number of leaf records. */ unsigned int xfs_btree_compute_maxlevels( const unsigned int *limits, unsigned long long records) { unsigned long long level_blocks = howmany_64(records, limits[0]); unsigned int height = 1; while (level_blocks > 1) { level_blocks = howmany_64(level_blocks, limits[1]); height++; } return height; } /* * For the given limits on leaf and keyptr records per block, calculate the * number of blocks needed to index the given number of leaf records. */ unsigned long long xfs_btree_calc_size( const unsigned int *limits, unsigned long long records) { unsigned long long level_blocks = howmany_64(records, limits[0]); unsigned long long blocks = level_blocks; while (level_blocks > 1) { level_blocks = howmany_64(level_blocks, limits[1]); blocks += level_blocks; } return blocks; } /* * Given a number of available blocks for the btree to consume with records and * pointers, calculate the height of the tree needed to index all the records * that space can hold based on the number of pointers each interior node * holds. * * We start by assuming a single level tree consumes a single block, then track * the number of blocks each node level consumes until we no longer have space * to store the next node level. At this point, we are indexing all the leaf * blocks in the space, and there's no more free space to split the tree any * further. That's our maximum btree height. */ unsigned int xfs_btree_space_to_height( const unsigned int *limits, unsigned long long leaf_blocks) { /* * The root btree block can have fewer than minrecs pointers in it * because the tree might not be big enough to require that amount of * fanout. Hence it has a minimum size of 2 pointers, not limits[1]. */ unsigned long long node_blocks = 2; unsigned long long blocks_left = leaf_blocks - 1; unsigned int height = 1; if (leaf_blocks < 1) return 0; while (node_blocks < blocks_left) { blocks_left -= node_blocks; node_blocks *= limits[1]; height++; } return height; } /* * Query a regular btree for all records overlapping a given interval. * Start with a LE lookup of the key of low_rec and return all records * until we find a record with a key greater than the key of high_rec. */ STATIC int xfs_btree_simple_query_range( struct xfs_btree_cur *cur, const union xfs_btree_key *low_key, const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { union xfs_btree_rec *recp; union xfs_btree_key rec_key; int stat; bool firstrec = true; int error; ASSERT(cur->bc_ops->init_high_key_from_rec); ASSERT(cur->bc_ops->diff_two_keys); /* * Find the leftmost record. The btree cursor must be set * to the low record used to generate low_key. */ stat = 0; error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); if (error) goto out; /* Nothing? See if there's anything to the right. */ if (!stat) { error = xfs_btree_increment(cur, 0, &stat); if (error) goto out; } while (stat) { /* Find the record. */ error = xfs_btree_get_rec(cur, &recp, &stat); if (error || !stat) break; /* Skip if low_key > high_key(rec). */ if (firstrec) { cur->bc_ops->init_high_key_from_rec(&rec_key, recp); firstrec = false; if (xfs_btree_keycmp_gt(cur, low_key, &rec_key)) goto advloop; } /* Stop if low_key(rec) > high_key. */ cur->bc_ops->init_key_from_rec(&rec_key, recp); if (xfs_btree_keycmp_gt(cur, &rec_key, high_key)) break; /* Callback */ error = fn(cur, recp, priv); if (error) break; advloop: /* Move on to the next record. */ error = xfs_btree_increment(cur, 0, &stat); if (error) break; } out: return error; } /* * Query an overlapped interval btree for all records overlapping a given * interval. This function roughly follows the algorithm given in * "Interval Trees" of _Introduction to Algorithms_, which is section * 14.3 in the 2nd and 3rd editions. * * First, generate keys for the low and high records passed in. * * For any leaf node, generate the high and low keys for the record. * If the record keys overlap with the query low/high keys, pass the * record to the function iterator. * * For any internal node, compare the low and high keys of each * pointer against the query low/high keys. If there's an overlap, * follow the pointer. * * As an optimization, we stop scanning a block when we find a low key * that is greater than the query's high key. */ STATIC int xfs_btree_overlapped_query_range( struct xfs_btree_cur *cur, const union xfs_btree_key *low_key, const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { union xfs_btree_ptr ptr; union xfs_btree_ptr *pp; union xfs_btree_key rec_key; union xfs_btree_key rec_hkey; union xfs_btree_key *lkp; union xfs_btree_key *hkp; union xfs_btree_rec *recp; struct xfs_btree_block *block; int level; struct xfs_buf *bp; int i; int error; /* Load the root of the btree. */ level = cur->bc_nlevels - 1; xfs_btree_init_ptr_from_cur(cur, &ptr); error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); if (error) return error; xfs_btree_get_block(cur, level, &bp); trace_xfs_btree_overlapped_query_range(cur, level, bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto out; #endif cur->bc_levels[level].ptr = 1; while (level < cur->bc_nlevels) { block = xfs_btree_get_block(cur, level, &bp); /* End of node, pop back towards the root. */ if (cur->bc_levels[level].ptr > be16_to_cpu(block->bb_numrecs)) { pop_up: if (level < cur->bc_nlevels - 1) cur->bc_levels[level + 1].ptr++; level++; continue; } if (level == 0) { /* Handle a leaf node. */ recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block); cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); cur->bc_ops->init_key_from_rec(&rec_key, recp); /* * If (query's high key < record's low key), then there * are no more interesting records in this block. Pop * up to the leaf level to find more record blocks. * * If (record's high key >= query's low key) and * (query's high key >= record's low key), then * this record overlaps the query range; callback. */ if (xfs_btree_keycmp_lt(cur, high_key, &rec_key)) goto pop_up; if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) { error = fn(cur, recp, priv); if (error) break; } cur->bc_levels[level].ptr++; continue; } /* Handle an internal node. */ lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block); pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); /* * If (query's high key < pointer's low key), then there are no * more interesting keys in this block. Pop up one leaf level * to continue looking for records. * * If (pointer's high key >= query's low key) and * (query's high key >= pointer's low key), then * this record overlaps the query range; follow pointer. */ if (xfs_btree_keycmp_lt(cur, high_key, lkp)) goto pop_up; if (xfs_btree_keycmp_ge(cur, hkp, low_key)) { level--; error = xfs_btree_lookup_get_block(cur, level, pp, &block); if (error) goto out; xfs_btree_get_block(cur, level, &bp); trace_xfs_btree_overlapped_query_range(cur, level, bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto out; #endif cur->bc_levels[level].ptr = 1; continue; } cur->bc_levels[level].ptr++; } out: /* * If we don't end this function with the cursor pointing at a record * block, a subsequent non-error cursor deletion will not release * node-level buffers, causing a buffer leak. This is quite possible * with a zero-results range query, so release the buffers if we * failed to return any results. */ if (cur->bc_levels[0].bp == NULL) { for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_levels[i].bp) { xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); cur->bc_levels[i].bp = NULL; cur->bc_levels[i].ptr = 0; cur->bc_levels[i].ra = 0; } } } return error; } static inline void xfs_btree_key_from_irec( struct xfs_btree_cur *cur, union xfs_btree_key *key, const union xfs_btree_irec *irec) { union xfs_btree_rec rec; cur->bc_rec = *irec; cur->bc_ops->init_rec_from_cur(cur, &rec); cur->bc_ops->init_key_from_rec(key, &rec); } /* * Query a btree for all records overlapping a given interval of keys. The * supplied function will be called with each record found; return one of the * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error * code. This function returns -ECANCELED, zero, or a negative error code. */ int xfs_btree_query_range( struct xfs_btree_cur *cur, const union xfs_btree_irec *low_rec, const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv) { union xfs_btree_key low_key; union xfs_btree_key high_key; /* Find the keys of both ends of the interval. */ xfs_btree_key_from_irec(cur, &high_key, high_rec); xfs_btree_key_from_irec(cur, &low_key, low_rec); /* Enforce low key <= high key. */ if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, fn, priv); } /* Query a btree for all records. */ int xfs_btree_query_all( struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv) { union xfs_btree_key low_key; union xfs_btree_key high_key; memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); memset(&low_key, 0, sizeof(low_key)); memset(&high_key, 0xFF, sizeof(high_key)); return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); } static int xfs_btree_count_blocks_helper( struct xfs_btree_cur *cur, int level, void *data) { xfs_filblks_t *blocks = data; (*blocks)++; return 0; } /* Count the blocks in a btree and return the result in *blocks. */ int xfs_btree_count_blocks( struct xfs_btree_cur *cur, xfs_filblks_t *blocks) { *blocks = 0; return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, XFS_BTREE_VISIT_ALL, blocks); } /* Compare two btree pointers. */ int64_t xfs_btree_diff_two_ptrs( struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } struct xfs_btree_has_records { /* Keys for the start and end of the range we want to know about. */ union xfs_btree_key start_key; union xfs_btree_key end_key; /* Mask for key comparisons, if desired. */ const union xfs_btree_key *key_mask; /* Highest record key we've seen so far. */ union xfs_btree_key high_key; enum xbtree_recpacking outcome; }; STATIC int xfs_btree_has_records_helper( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { union xfs_btree_key rec_key; union xfs_btree_key rec_high_key; struct xfs_btree_has_records *info = priv; enum xbtree_key_contig key_contig; cur->bc_ops->init_key_from_rec(&rec_key, rec); if (info->outcome == XBTREE_RECPACKING_EMPTY) { info->outcome = XBTREE_RECPACKING_SPARSE; /* * If the first record we find does not overlap the start key, * then there is a hole at the start of the search range. * Classify this as sparse and stop immediately. */ if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key, info->key_mask)) return -ECANCELED; } else { /* * If a subsequent record does not overlap with the any record * we've seen so far, there is a hole in the middle of the * search range. Classify this as sparse and stop. * If the keys overlap and this btree does not allow overlap, * signal corruption. */ key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, &rec_key, info->key_mask); if (key_contig == XBTREE_KEY_OVERLAP && !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return -EFSCORRUPTED; if (key_contig == XBTREE_KEY_GAP) return -ECANCELED; } /* * If high_key(rec) is larger than any other high key we've seen, * remember it for later. */ cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec); if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key, info->key_mask)) info->high_key = rec_high_key; /* struct copy */ return 0; } /* * Scan part of the keyspace of a btree and tell us if that keyspace does not * map to any records; is fully mapped to records; or is partially mapped to * records. This is the btree record equivalent to determining if a file is * sparse. * * For most btree types, the record scan should use all available btree key * fields to compare the keys encountered. These callers should pass NULL for * @mask. However, some callers (e.g. scanning physical space in the rmapbt) * want to ignore some part of the btree record keyspace when performing the * comparison. These callers should pass in a union xfs_btree_key object with * the fields that *should* be a part of the comparison set to any nonzero * value, and the rest zeroed. */ int xfs_btree_has_records( struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, const union xfs_btree_key *mask, enum xbtree_recpacking *outcome) { struct xfs_btree_has_records info = { .outcome = XBTREE_RECPACKING_EMPTY, .key_mask = mask, }; int error; /* Not all btrees support this operation. */ if (!cur->bc_ops->keys_contiguous) { ASSERT(0); return -EOPNOTSUPP; } xfs_btree_key_from_irec(cur, &info.start_key, low); xfs_btree_key_from_irec(cur, &info.end_key, high); error = xfs_btree_query_range(cur, low, high, xfs_btree_has_records_helper, &info); if (error == -ECANCELED) goto out; if (error) return error; if (info.outcome == XBTREE_RECPACKING_EMPTY) goto out; /* * If the largest high_key(rec) we saw during the walk is greater than * the end of the search range, classify this as full. Otherwise, * there is a hole at the end of the search range. */ if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key, mask)) info.outcome = XBTREE_RECPACKING_FULL; out: *outcome = info.outcome; return 0; } /* Are there more records in this btree? */ bool xfs_btree_has_more_records( struct xfs_btree_cur *cur) { struct xfs_btree_block *block; struct xfs_buf *bp; block = xfs_btree_get_block(cur, 0, &bp); /* There are still records in this block. */ if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) return true; /* There are more record blocks. */ if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); } /* Set up all the btree cursor caches. */ int __init xfs_btree_init_cur_caches(void) { int error; error = xfs_allocbt_init_cur_cache(); if (error) return error; error = xfs_inobt_init_cur_cache(); if (error) goto err; error = xfs_bmbt_init_cur_cache(); if (error) goto err; error = xfs_rmapbt_init_cur_cache(); if (error) goto err; error = xfs_refcountbt_init_cur_cache(); if (error) goto err; return 0; err: xfs_btree_destroy_cur_caches(); return error; } /* Destroy all the btree cursor caches, if they've been allocated. */ void xfs_btree_destroy_cur_caches(void) { xfs_allocbt_destroy_cur_cache(); xfs_inobt_destroy_cur_cache(); xfs_bmbt_destroy_cur_cache(); xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } /* Move the btree cursor before the first record. */ int xfs_btree_goto_left_edge( struct xfs_btree_cur *cur) { int stat = 0; int error; memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); if (error) return error; if (!stat) return 0; error = xfs_btree_decrement(cur, 0, &stat); if (error) return error; if (stat != 0) { ASSERT(0); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } |
440 476 351 449 450 7 52 241 15 12 98 56 3 2 51 3 42 2 23 297 152 171 293 11 10 84 265 22 75 50 6 19 24 2 13 1 12 1 2 1 2 8 70 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 | // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide after a * seeing an unknown discard-with-error flavour TLV option if it's a * ICMP error message or not (errors should never be send in reply to * ICMP error messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses (probably truncated) exthdr set "hdr". * "nexthdrp" initially points to some place, * where type of the first header can be found. * * It skips all well-known exthdrs, and returns pointer to the start * of unparsable area i.e. the first header with unknown type. * If it is not NULL *nexthdr is updated by type/protocol of this header. * * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. * - it may return pointer pointing beyond end of packet, * if the last recognized header is truncated in the middle. * - if packet is truncated, so that all parsed headers are skipped, * it returns NULL. * - First fragment header is skipped, not-first ones * are considered as unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented. * - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *flags) { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; bool found; if (fragoff) *fragoff = 0; if (*offset) { struct ipv6hdr _ip6, *ip6; ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); if (!ip6 || (ip6->version != 6)) return -EBADMSG; start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } do { struct ipv6_opt_hdr _hdr, *hp; unsigned int hdrlen; found = (nexthdr == target); if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { if (target < 0 || found) break; return -ENOENT; } hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; if (nexthdr == NEXTHDR_ROUTING) { struct ipv6_rt_hdr _rh, *rh; rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh); if (!rh) return -EBADMSG; if (flags && (*flags & IP6_FH_F_SKIP_RH) && rh->segments_left == 0) found = false; } if (nexthdr == NEXTHDR_FRAGMENT) { unsigned short _frag_off; __be16 *fp; if (flags) /* Indicate that this is a fragment */ *flags |= IP6_FH_F_FRAG; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -EBADMSG; _frag_off = ntohs(*fp) & ~0x7; if (_frag_off) { if (target < 0 && ((!ipv6_ext_hdr(hp->nexthdr)) || hp->nexthdr == NEXTHDR_NONE)) { if (fragoff) *fragoff = _frag_off; return hp->nexthdr; } if (!found) return -ENOENT; if (fragoff) *fragoff = _frag_off; break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) break; hdrlen = ipv6_authlen(hp); } else hdrlen = ipv6_optlen(hp); if (!found) { nexthdr = hp->nexthdr; start += hdrlen; } } while (!found); *offset = start; return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr); |
12 4 8 10 1 10 10 10 14 3 3 4 11 6 4 4 4 4 4 4 4 4 4 4 4 4 4 11 9 11 11 3 5 6 3 6 919 918 3 9 6 6 6 5 1 4 5 8 2 4 3 152 1 4 2 4 143 3 4 3 12 12 2 12 6 8 12 4 4 4 4 4 261 257 6 81 83 6 1 10 4 6 1 3 4 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 | // SPDX-License-Identifier: GPL-2.0-only /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork() * * Copyright (C) 2008-2009 Red Hat, Inc. * Authors: * Izik Eidus * Andrea Arcangeli * Chris Wright * Hugh Dickins */ #include <linux/errno.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/cputime.h> #include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/spinlock.h> #include <linux/xxhash.h> #include <linux/delay.h> #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hashtable.h> #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" #include "mm_slot.h" #define CREATE_TRACE_POINTS #include <trace/events/ksm.h> #ifdef CONFIG_NUMA #define NUMA(x) (x) #define DO_NUMA(x) do { (x); } while (0) #else #define NUMA(x) (0) #define DO_NUMA(x) do { } while (0) #endif typedef u8 rmap_age_t; /** * DOC: Overview * * A few notes about the KSM scanning process, * to make it easier to understand the data structures below: * * In order to reduce excessive scanning, KSM sorts the memory pages by their * contents into a data structure that holds pointers to the pages' locations. * * Since the contents of the pages may change at any moment, KSM cannot just * insert the pages into a normal sorted tree and expect it to find anything. * Therefore KSM uses two data structures - the stable and the unstable tree. * * The stable tree holds pointers to all the merged pages (ksm pages), sorted * by their contents. Because each such page is write-protected, searching on * this tree is fully assured to be working (except when pages are unmapped), * and therefore this tree is called the stable tree. * * The stable tree node includes information required for reverse * mapping from a KSM page to virtual addresses that map this page. * * In order to avoid large latencies of the rmap walks on KSM pages, * KSM maintains two types of nodes in the stable tree: * * * the regular nodes that keep the reverse mapping structures in a * linked list * * the "chains" that link nodes ("dups") that represent the same * write protected memory content, but each "dup" corresponds to a * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to * be "unchanged for a period of time". The unstable tree sorts these pages * by their contents, but since they are not write-protected, KSM cannot rely * upon the unstable tree to work correctly - the unstable tree is liable to * be corrupted as its contents are modified, and so it is called unstable. * * KSM solves this problem by several techniques: * * 1) The unstable tree is flushed every time KSM completes scanning all * memory areas, and then the tree is rebuilt again from the beginning. * 2) KSM will only insert into the unstable tree, pages whose hash value * has not changed since the previous scan of all memory areas. * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the * colors of the nodes and not on their contents, assuring that even when * the tree gets "corrupted" it won't get out of balance, so scanning time * remains the same (also, searching and inserting nodes in an rbtree uses * the same algorithm, so we have no overhead when we flush and rebuild). * 4) KSM never flushes the stable tree, which means that even if it were to * take 10 attempts to find a page in the unstable tree, once it is found, * it is secured in the stable tree. (When we scan a new page, we first * compare it against the stable tree, and then against the unstable tree.) * * If the merge_across_nodes tunable is unset, then KSM maintains multiple * stable trees and multiple unstable trees: one of each for each NUMA node. */ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items */ struct ksm_mm_slot { struct mm_slot slot; struct ksm_rmap_item *rmap_list; }; /** * struct ksm_scan - cursor for scanning * @mm_slot: the current mm_slot we are scanning * @address: the next address inside that to be scanned * @rmap_list: link to the next rmap to be scanned in the rmap_list * @seqnr: count of completed full scans (needed when removing unstable node) * * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { struct ksm_mm_slot *mm_slot; unsigned long address; struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; struct { struct hlist_node hlist_dup; struct list_head list; }; }; }; struct hlist_head hlist; union { unsigned long kpfn; unsigned long chain_prune_time; }; /* * STABLE_NODE_CHAIN can be any negative number in * rmap_hlist_len negative range, but better not -1 to be able * to reliably detect underflows. */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif }; /** * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node * @age: number of scan iterations since creation * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA int nid; /* when node of unstable tree */ #endif }; struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ rmap_age_t age; rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ struct ksm_stable_node *head; struct hlist_node hlist; }; }; }; #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; static struct rb_root one_unstable_tree[1] = { RB_ROOT }; static struct rb_root *root_stable_tree = one_stable_tree; static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); #define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, }; static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; /* Default number of pages to scan per batch */ #define DEFAULT_PAGES_TO_SCAN 100 /* The number of pages scanned */ static unsigned long ksm_pages_scanned; /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; /* The number of page slots additionally sharing those nodes */ static unsigned long ksm_pages_sharing; /* The number of nodes in the unstable tree */ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; /* The number of stable_node chains */ static unsigned long ksm_stable_node_chains; /* The number of stable_node dups linked to the stable_node chains */ static unsigned long ksm_stable_node_dups; /* Delay in pruning stale stable_node_dups in the stable_node_chains */ static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; /* Maximum number of page slots sharing a stable node */ static int ksm_max_page_sharing = 256; /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; /* Checksum of an empty (zeroed) page */ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; /* Skip pages that couldn't be de-duplicated previously */ /* Default to true at least temporarily, for testing */ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; /* Don't scan more than max pages per batch. */ static unsigned long ksm_advisor_max_pages_to_scan = 30000; /* Min CPU for scanning pages per scan */ #define KSM_ADVISOR_MIN_CPU 10 /* Max CPU for scanning pages per scan */ static unsigned int ksm_advisor_max_cpu = 70; /* Target scan time in seconds to analyze all KSM candidate pages. */ static unsigned long ksm_advisor_target_scan_time = 200; /* Exponentially weighted moving average. */ #define EWMA_WEIGHT 30 /** * struct advisor_ctx - metadata for KSM advisor * @start_scan: start time of the current scan * @scan_time: scan time of previous scan * @change: change in percent to pages_to_scan parameter * @cpu_time: cpu time consumed by the ksmd thread in the previous scan */ struct advisor_ctx { ktime_t start_scan; unsigned long scan_time; unsigned long change; unsigned long long cpu_time; }; static struct advisor_ctx advisor_ctx; /* Define different advisor's */ enum ksm_advisor_type { KSM_ADVISOR_NONE, KSM_ADVISOR_SCAN_TIME, }; static enum ksm_advisor_type ksm_advisor; #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ /* At least scan this many pages per batch. */ static unsigned long ksm_advisor_min_pages_to_scan = 500; static void set_advisor_defaults(void) { if (ksm_advisor == KSM_ADVISOR_NONE) { ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN; } else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) { advisor_ctx = (const struct advisor_ctx){ 0 }; ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan; } } #endif /* CONFIG_SYSFS */ static inline void advisor_start_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) advisor_ctx.start_scan = ktime_get(); } /* * Use previous scan time if available, otherwise use current scan time as an * approximation for the previous scan time. */ static inline unsigned long prev_scan_time(struct advisor_ctx *ctx, unsigned long scan_time) { return ctx->scan_time ? ctx->scan_time : scan_time; } /* Calculate exponential weighted moving average */ static unsigned long ewma(unsigned long prev, unsigned long curr) { return ((100 - EWMA_WEIGHT) * prev + EWMA_WEIGHT * curr) / 100; } /* * The scan time advisor is based on the current scan rate and the target * scan rate. * * new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time) * * To avoid perturbations it calculates a change factor of previous changes. * A new change factor is calculated for each iteration and it uses an * exponentially weighted moving average. The new pages_to_scan value is * multiplied with that change factor: * * new_pages_to_scan *= change facor * * The new_pages_to_scan value is limited by the cpu min and max values. It * calculates the cpu percent for the last scan and calculates the new * estimated cpu percent cost for the next scan. That value is capped by the * cpu min and max setting. * * In addition the new pages_to_scan value is capped by the max and min * limits. */ static void scan_time_advisor(void) { unsigned int cpu_percent; unsigned long cpu_time; unsigned long cpu_time_diff; unsigned long cpu_time_diff_ms; unsigned long pages; unsigned long per_page_cost; unsigned long factor; unsigned long change; unsigned long last_scan_time; unsigned long scan_time; /* Convert scan time to seconds */ scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan), MSEC_PER_SEC); scan_time = scan_time ? scan_time : 1; /* Calculate CPU consumption of ksmd background thread */ cpu_time = task_sched_runtime(current); cpu_time_diff = cpu_time - advisor_ctx.cpu_time; cpu_time_diff_ms = cpu_time_diff / 1000 / 1000; cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000); cpu_percent = cpu_percent ? cpu_percent : 1; last_scan_time = prev_scan_time(&advisor_ctx, scan_time); /* Calculate scan time as percentage of target scan time */ factor = ksm_advisor_target_scan_time * 100 / scan_time; factor = factor ? factor : 1; /* * Calculate scan time as percentage of last scan time and use * exponentially weighted average to smooth it */ change = scan_time * 100 / last_scan_time; change = change ? change : 1; change = ewma(advisor_ctx.change, change); /* Calculate new scan rate based on target scan rate. */ pages = ksm_thread_pages_to_scan * 100 / factor; /* Update pages_to_scan by weighted change percentage. */ pages = pages * change / 100; /* Cap new pages_to_scan value */ per_page_cost = ksm_thread_pages_to_scan / cpu_percent; per_page_cost = per_page_cost ? per_page_cost : 1; pages = min(pages, per_page_cost * ksm_advisor_max_cpu); pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU); pages = min(pages, ksm_advisor_max_pages_to_scan); /* Update advisor context */ advisor_ctx.change = change; advisor_ctx.scan_time = scan_time; advisor_ctx.cpu_time = cpu_time; ksm_thread_pages_to_scan = pages; trace_ksm_advisor(scan_time, pages, cpu_percent); } static void advisor_stop_scan(void) { if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) scan_time_advisor(); } #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; static int ksm_nr_node_ids = 1; #else #define ksm_merge_across_nodes 1U #define ksm_nr_node_ids 1 #endif #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 static unsigned long ksm_run = KSM_RUN_STOP; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); static int __init ksm_slab_init(void) { rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; stable_node_cache = KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; return 0; out_free2: kmem_cache_destroy(stable_node_cache); out_free1: kmem_cache_destroy(rmap_item_cache); out: return -ENOMEM; } static void __init ksm_slab_free(void) { kmem_cache_destroy(mm_slot_cache); kmem_cache_destroy(stable_node_cache); kmem_cache_destroy(rmap_item_cache); mm_slot_cache = NULL; } static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; VM_BUG_ON(!is_stable_node_chain(chain)); hlist_add_head(&dup->hlist_dup, &chain->hlist); ksm_stable_node_dups++; } static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) __stable_node_dup_del(dup); else rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); #ifdef CONFIG_DEBUG_VM dup->head = NULL; #endif } static inline struct ksm_rmap_item *alloc_rmap_item(void) { struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; } static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under * pressure, which may lead to hung task warnings. Adding __GFP_HIGH * grants access to memory reserves, helping to avoid this problem. */ return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, * takes mmap_lock briefly to serialize against them. ksm_exit() does not set * a special flag: they can just back out as soon as mm_users goes to zero. * ksm_test_exit() is used throughout to make this test for exit: in some * places for correctness, in some places just to avoid unnecessary work. */ static inline bool ksm_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. * * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma) { vm_fault_t ret = 0; if (lock_vma) vma_start_write(vma); do { bool ksm_page = false; struct folio_walk fw; struct folio *folio; cond_resched(); folio = folio_walk_start(&fw, vma, addr, FW_MIGRATION | FW_ZEROPAGE); if (folio) { /* Small folio implies FW_LEVEL_PTE. */ if (!folio_test_large(folio) && (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte))) ksm_page = true; folio_walk_end(&fw, vma); } if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop until we no longer find a KSM page because * handle_mm_fault() may back out if there's any difficulty e.g. if * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but ksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static bool vma_ksm_compatible(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_HUGETLB | VM_MIXEDMAP| VM_DROPPABLE)) return false; /* just ignore the advice */ if (vma_is_dax(vma)) return false; #ifdef VM_SAO if (vma->vm_flags & VM_SAO) return false; #endif #ifdef VM_SPARC_ADI if (vma->vm_flags & VM_SPARC_ADI) return false; #endif return true; } static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; vma = vma_lookup(mm, addr); if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; /* * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ put_anon_vma(rmap_item->anon_vma); mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) break_ksm(vma, addr, false); mmap_read_unlock(mm); } static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; struct vm_area_struct *vma; struct page *page = NULL; struct folio_walk fw; struct folio *folio; mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (!vma) goto out; folio = folio_walk_start(&fw, vma, addr, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); page = fw.page; } folio_walk_end(&fw, vma); } out: if (page) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } mmap_read_unlock(mm); return page; } /* * This helper is used for getting right index into array of tree roots. * When merge_across_nodes knob is set to 1, there are only two rb-trees for * stable and unstable pages from all nodes with roots in index 0. Otherwise, * every node has its own stable and unstable tree. */ static inline int get_kpfn_nid(unsigned long kpfn) { return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); chain->chain_prune_time = jiffies; chain->rmap_hlist_len = STABLE_NODE_CHAIN; #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) chain->nid = NUMA_NO_NODE; /* debug */ #endif ksm_stable_node_chains++; /* * Put the stable node chain in the first dimension of * the stable tree and at the same time remove the old * stable node. */ rb_replace_node(&dup->node, &chain->node, root); /* * Move the old stable node to the second dimension * queued in the hlist_dup. The invariant is that all * dup stable_nodes in the chain->hlist point to pages * that are write protected and have the exact same * content. */ stable_node_chain_add_dup(dup, chain); } return chain; } static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); free_stable_node(chain); ksm_stable_node_chains--; } static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) { ksm_pages_sharing--; trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm); } else { ksm_pages_shared--; } rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } /* * We need the second aligned pointer of the migrate_nodes * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. */ BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); trace_ksm_remove_ksm_page(stable_node->kpfn); if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else stable_node_dup_del(stable_node); free_stable_node(stable_node); } enum ksm_get_folio_flags { KSM_GET_FOLIO_NOLOCK, KSM_GET_FOLIO_LOCK, KSM_GET_FOLIO_TRYLOCK }; /* * ksm_get_folio: checks if the page indicated by the stable node * is still its ksm page, despite having held no reference to it. * In which case we can trust the content of the page, and it * returns the gotten page; but if the page has now been zapped, * remove the stale node from the stable tree and return NULL. * But beware, the stable node's page might be being migrated. * * You would expect the stable_node to hold a reference to the ksm page. * But if it increments the page's count, swapping out has to wait for * ksmd to come around again before it can free the page, which may take * seconds or even minutes: much too unresponsive. So instead we use a * "keyhole reference": access to the ksm page from the stable node peeps * out through its keyhole to see if that page still holds the right key, * pointing back to this stable node. This relies on freeing a PageAnon * page to reset its page->mapping to NULL, and relies on no other use of * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, enum ksm_get_folio_flags flags) { struct folio *folio; void *expected_mapping; unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); if (READ_ONCE(folio->mapping) != expected_mapping) goto stale; /* * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!folio_try_get(folio)) { /* * Another check for folio->mapping != expected_mapping * would work here too. We have chosen to test the * swapcache flag to optimize the common case, when the * folio is or is about to be freed: the swapcache flag * is cleared (under spin_lock_irq) in the ref_freeze * section of __remove_mapping(); but anon folio->mapping * is reset to NULL later, in free_pages_prepare(). */ if (!folio_test_swapcache(folio)) goto stale; cpu_relax(); } if (READ_ONCE(folio->mapping) != expected_mapping) { folio_put(folio); goto stale; } if (flags == KSM_GET_FOLIO_TRYLOCK) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EBUSY); } } else if (flags == KSM_GET_FOLIO_LOCK) folio_lock(folio); if (flags != KSM_GET_FOLIO_NOLOCK) { if (READ_ONCE(folio->mapping) != expected_mapping) { folio_unlock(folio); folio_put(folio); goto stale; } } return folio; stale: /* * We come here from above when folio->mapping or the swapcache flag * suggests that the node is stale; but it might be under migration. * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), * before checking whether node->kpfn has been changed. */ smp_rmb(); if (READ_ONCE(stable_node->kpfn) != kpfn) goto again; remove_node_from_stable_tree(stable_node); return NULL; } /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { struct ksm_stable_node *stable_node; struct folio *folio; stable_node = rmap_item->head; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) goto out; hlist_del(&rmap_item->hlist); folio_unlock(folio); folio_put(folio); if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; rmap_item->mm->ksm_merging_pages--; VM_BUG_ON(stable_node->rmap_hlist_len <= 0); stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { unsigned char age; /* * Usually ksmd can and must skip the rb_erase, because * root_unstable_tree was already reset to RB_ROOT. * But be careful when an mm is exiting: do the rb_erase * if this rmap_item was inserted by this scan, rather * than left over from before. */ age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); BUG_ON(age > 1); if (!age) rb_erase(&rmap_item->node, root_unstable_tree + NUMA(rmap_item->nid)); ksm_pages_unshared--; rmap_item->address &= PAGE_MASK; } out: cond_resched(); /* we're called from many long loops */ } static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } } /* * Though it's very tempting to unmerge rmap_items from stable tree rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. */ static int unmerge_ksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool lock_vma) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (ksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr, lock_vma); } return err; } static inline struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); } #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: */ static int remove_stable_node(struct ksm_stable_node *stable_node) { struct folio *folio; int err; folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK); if (!folio) { /* * ksm_get_folio did remove_node_from_stable_tree itself. */ return 0; } /* * Page could be still mapped if this races with __mmput() running in * between ksm_exit() and exit_mmap(). Just refuse to let * merge_across_nodes/max_page_sharing be switched. */ err = -EBUSY; if (!folio_mapped(folio)) { /* * The stable node did not yet appear stale to ksm_get_folio(), * since that allows for an unmapped ksm folio to be recognized * right up until it is freed; but the node is safe to remove. * This folio might be in an LRU cache waiting to be freed, * or it might be in the swapcache (perhaps under writeback), * or it might have been removed from swapcache a moment ago. */ folio_set_stable_node(folio, NULL); remove_node_from_stable_tree(stable_node); err = 0; } folio_unlock(folio); folio_put(folio); return err; } static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); if (remove_stable_node(stable_node)) return true; else return false; } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); if (remove_stable_node(dup)) return true; } BUG_ON(!hlist_empty(&stable_node->hlist)); free_stable_node_chain(stable_node, root); return false; } static int remove_all_stable_nodes(void) { struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); } return err; } static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); slot = list_entry(ksm_mm_head.slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); mm = mm_slot->slot.mm; mmap_read_lock(mm); /* * Exit right away if mm is exiting to avoid lockdep issue in * the maple tree */ if (ksm_test_exit(mm)) goto mm_exiting; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) continue; err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, false); if (err) goto error; } mm_exiting: remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); } /* Clean up stable nodes, but don't worry if some are still busy */ remove_all_stable_nodes(); ksm_scan.seqnr = 0; return 0; error: mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = &ksm_mm_head; spin_unlock(&ksm_mmlist_lock); return err; } #endif /* CONFIG_SYSFS */ static u32 calc_checksum(struct page *page) { u32 checksum; void *addr = kmap_local_page(page); checksum = xxhash(addr, PAGE_SIZE, 0); kunmap_local(addr); return checksum; } static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, pte_t *orig_pte) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0); int swapped; int err = -EFAULT; struct mmu_notifier_range range; bool anon_exclusive; pte_t entry; if (WARN_ON_ONCE(folio_test_large(folio))) return err; pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) goto out_mn; if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; anon_exclusive = PageAnonExclusive(&folio->page); entry = ptep_get(pvmw.pte); if (pte_write(entry) || pte_dirty(entry) || anon_exclusive || mm_tlb_flush_pending(mm)) { swapped = folio_test_swapcache(folio); flush_cache_page(vma, pvmw.address, folio_pfn(folio)); /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. * * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, &folio->page)) { set_pte_at(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) folio_mark_dirty(folio); entry = pte_mkclean(entry); if (pte_write(entry)) entry = pte_wrprotect(entry); set_pte_at(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = entry; err = 0; out_unlock: page_vma_mapped_walk_done(&pvmw); out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /** * replace_page - replace page in vma by new ksm page * @vma: vma that holds the pte pointing to page * @page: the page we are replacing by kpage * @kpage: the ksm page we replace page by * @orig_pte: the original value of the pte * * Returns 0 on success, -EFAULT on failure. */ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; unsigned long addr; int err = -EFAULT; struct mmu_notifier_range range; addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; /* * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ pmde = pmdp_get_lockless(pmd); if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); if (!ptep) goto out_mn; if (!pte_same(ptep_get(ptep), orig_pte)) { pte_unmap_unlock(ptep, ptl); goto out_mn; } VM_BUG_ON_PAGE(PageAnonExclusive(page), page); VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage), kfolio); /* * No need to check ksm_use_zero_pages here: we can only have a * zero_page here if ksm_use_zero_pages was enabled already. */ if (!is_zero_pfn(page_to_pfn(kpage))) { folio_get(kfolio); folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { /* * Use pte_mkdirty to mark the zero page mapped by KSM, and then * we can easily track all KSM-placed zero pages by checking if * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we * will get wrong values in /proc, and a BUG message in dmesg * when tearing down the mm. */ dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep))); /* * No need to notify as we are replacing a read only page with another * read only page with the same content. * * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; out_mn: mmu_notifier_invalidate_range_end(&range); out: return err; } /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in * write_protect_page(). We trylock because we don't want to wait * here - we prefer to continue scanning and merging different * pages, then come back to this page when it is unlocked. */ if (!folio_trylock(folio)) goto out; if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; folio = page_folio(page); } /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* * While we hold folio lock, upgrade folio from * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ folio_set_stable_node(folio, NULL); folio_mark_accessed(folio); /* * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. */ if (!folio_test_dirty(folio)) folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: folio_unlock(folio); out: return err; } /* * This function returns 0 if the pages were merged or if they are * no longer merging candidates (e.g., VMA stale), -EFAULT otherwise. */ static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item, struct page *page) { struct mm_struct *mm = rmap_item->mm; int err = -EFAULT; /* * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ if (ksm_use_zero_pages && (rmap_item->oldchecksum == zero_checksum)) { struct vm_area_struct *vma; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (vma) { err = try_to_merge_one_page(vma, page, ZERO_PAGE(rmap_item->address)); trace_ksm_merge_one_page( page_to_pfn(ZERO_PAGE(rmap_item->address)), rmap_item, mm, err); } else { /* * If the vma is out of date, we do not need to * continue. */ err = 0; } mmap_read_unlock(mm); } return err; } /* * try_to_merge_with_ksm_page - like try_to_merge_two_pages, * but no new kernel page is allocated: kpage must already be a ksm page. * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; struct vm_area_struct *vma; int err = -EFAULT; mmap_read_lock(mm); vma = find_mergeable_vma(mm, rmap_item->address); if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); if (err) goto out; /* Unstable nid is in union with stable anon_vma: remove first */ remove_rmap_item_from_tree(rmap_item); /* Must get reference to anon_vma while still holding mmap_lock */ rmap_item->anon_vma = vma->anon_vma; get_anon_vma(vma->anon_vma); out: mmap_read_unlock(mm); trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page), rmap_item, mm, err); return err; } /* * try_to_merge_two_pages - take two identical pages and prepare them * to be merged into one page. * * This function returns the kpage if we successfully merged two identical * pages into one ksm page, NULL otherwise. * * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; err = try_to_merge_with_ksm_page(rmap_item, page, NULL); if (!err) { err = try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page); /* * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ if (err) break_cow(rmap_item); } return err ? NULL : page_folio(page); } static __always_inline bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* * Check that at least one mapping still exists, otherwise * there's no much point to merge and share with this * stable_node, as the underlying tree_page of the other * sharer is going to be freed soon. */ return stable_node->rmap_hlist_len && stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; } static __always_inline bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct folio *folio, *tree_folio = NULL; int found_rmap_hlist_len; if (!prune_stale_stable_nodes || time_before(jiffies, stable_node->chain_prune_time + msecs_to_jiffies( ksm_stable_node_chains_prune_millisecs))) prune_stale_stable_nodes = false; else stable_node->chain_prune_time = jiffies; hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { cond_resched(); /* * We must walk all stable_node_dup to prune the stale * stable nodes during lookup. * * ksm_get_folio can drop the nodes from the * stable_node->hlist if they point to freed pages * (that's why we do a _safe walk). The "dup" * stable_node parameter itself will be freed from * under us if it returns NULL. */ folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK); if (!folio) continue; /* Pick the best candidate if possible. */ if (!found || (is_page_sharing_candidate(dup) && (!is_page_sharing_candidate(found) || dup->rmap_hlist_len > found_rmap_hlist_len))) { if (found) folio_put(tree_folio); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; tree_folio = folio; /* skip put_page for found candidate */ if (!prune_stale_stable_nodes && is_page_sharing_candidate(found)) break; continue; } folio_put(folio); } if (found) { if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM * context with no lock held it's not even * fatal. */ BUG_ON(stable_node->hlist.first->next); /* * There's just one entry and it is below the * deduplication limit so drop the chain. */ rb_replace_node(&stable_node->node, &found->node, root); free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; /* * NOTE: the caller depends on the stable_node * to be equal to stable_node_dup if the chain * was collapsed. */ *_stable_node = found; /* * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. */ stable_node = NULL; } else if (stable_node->hlist.first != &found->hlist_dup && __is_page_sharing_candidate(found, 1)) { /* * If the found stable_node dup can accept one * more future merge (in addition to the one * that is underway) and is not at the head of * the chain, put it there so next search will * be quicker in the !prune_stale_stable_nodes * case. * * NOTE: it would be inaccurate to use nr > 1 * instead of checking the hlist.first pointer * directly, because in the * prune_stale_stable_nodes case "nr" isn't * the position of the found dup in the chain, * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, &stable_node->hlist); } } else { /* Its hlist must be empty if no one found. */ free_stable_node_chain(stable_node, root); } *_stable_node_dup = found; return tree_folio; } /* * Like for ksm_get_folio, this function can free the *_stable_node and * *_stable_node_dup if the returned tree_page is NULL. * * It can also free and overwrite *_stable_node with the found * stable_node_dup if the chain is collapsed (in which case * *_stable_node will be equal to *_stable_node_dup like if the chain * never existed). It's up to the caller to verify tree_page is not * NULL before dereferencing *_stable_node or *_stable_node_dup. * * *_stable_node_dup is really a second output parameter of this * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { *_stable_node_dup = stable_node; return ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); } return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, false); } /* * stable_tree_search - search for page inside the stable tree * * This function checks if there is a page inside the stable tree * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; struct ksm_stable_node *page_node; struct folio *folio; folio = page_folio(page); page_node = folio_stable_node(folio); if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); return folio; } nid = get_kpfn_nid(folio_pfn(folio)); root = root_stable_tree + nid; again: new = &root->rb_node; parent = NULL; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain_prune(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); /* * If the mapcount of our migrated KSM folio is * at most 1, we can merge it with another * KSM folio where we know that we have space * for one more mapping without exceeding the * ksm_max_page_sharing limit: see * chain_prune(). This way, we can avoid adding * this stable node to the chain. */ if (folio_mapcount(folio) > 1) goto chain_append; } if (!is_page_sharing_candidate(stable_node_dup)) { /* * If the stable_node is a chain and * we got a payload match in memcmp * but we cannot merge the scanned * page in any of the existing * stable_node dups because they're * all full, we need to wait the * scanned page to find itself a match * in the unstable tree to create a * brand new KSM page to add later to * the dups of this stable_node. */ return NULL; } /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page * migration is sure to notice its raised count. * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ tree_folio = ksm_get_folio(stable_node_dup, KSM_GET_FOLIO_TRYLOCK); if (PTR_ERR(tree_folio) == -EBUSY) return ERR_PTR(-EBUSY); if (unlikely(!tree_folio)) /* * The tree may have been rebalanced, * so re-evaluate parent and new. */ goto again; folio_unlock(tree_folio); if (get_kpfn_nid(stable_node_dup->kpfn) != NUMA(stable_node_dup->nid)) { folio_put(tree_folio); goto replace; } return tree_folio; } } if (!page_node) return NULL; list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); return folio; } else return NULL; replace: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); rb_replace_node(&stable_node_dup->node, &page_node->node, root); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { rb_erase(&stable_node_dup->node, root); folio = NULL; } } else { VM_BUG_ON(!is_stable_node_chain(stable_node)); __stable_node_dup_del(stable_node_dup); if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); if (is_page_sharing_candidate(page_node)) folio_get(folio); else folio = NULL; } else { folio = NULL; } } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); return folio; chain_append: /* * If stable_node was a chain and chain_prune collapsed it, * stable_node has been updated to be the new regular * stable_node. A collapse of the chain is indistinguishable * from the case there was no chain in the stable * rbtree. Otherwise stable_node is the chain and * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); if (!stable_node) return NULL; } /* * Add this stable_node dup that was * migrated to the stable_node chain * of the current nid for this page * content. */ VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); stable_node_chain_add_dup(page_node, stable_node); goto out; } /* * stable_tree_insert - insert stable tree node pointing to new ksm page * into the stable tree. * * This function returns the stable tree node just allocated on success, * NULL otherwise. */ static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; struct ksm_stable_node *stable_node, *stable_node_dup; bool need_chain = false; kpfn = folio_pfn(kfolio); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; again: parent = NULL; new = &root->rb_node; while (*new) { struct folio *tree_folio; int ret; cond_resched(); stable_node = rb_entry(*new, struct ksm_stable_node, node); tree_folio = chain(&stable_node_dup, &stable_node, root); if (!tree_folio) { /* * If we walked over a stale stable_node, * ksm_get_folio() will call rb_erase() and it * may rebalance the tree from under us. So * restart the search from scratch. Returning * NULL would be safe too, but we'd generate * false negative insertions just because some * stable_node was stale. */ goto again; } ret = memcmp_pages(&kfolio->page, &tree_folio->page); folio_put(tree_folio); parent = *new; if (ret < 0) new = &parent->rb_left; else if (ret > 0) new = &parent->rb_right; else { need_chain = true; break; } } stable_node_dup = alloc_stable_node(); if (!stable_node_dup) return NULL; INIT_HLIST_HEAD(&stable_node_dup->hlist); stable_node_dup->kpfn = kpfn; stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); if (!need_chain) { rb_link_node(&stable_node_dup->node, parent, new); rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { free_stable_node(stable_node_dup); return NULL; } } stable_node_chain_add_dup(stable_node_dup, stable_node); } folio_set_stable_node(kfolio, stable_node_dup); return stable_node_dup; } /* * unstable_tree_search_insert - search for identical page, * else insert rmap_item into the unstable tree. * * This function searches for a page in the unstable tree identical to the * page currently being scanned; and if no identical page is found in the * tree, we insert rmap_item as a new object into the unstable tree. * * This function returns pointer to rmap_item found to be identical * to the currently scanned page, NULL otherwise. * * This function does both searching and inserting, because they share * the same walking algorithm in an rbtree. */ static struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { struct rb_node **new; struct rb_root *root; struct rb_node *parent = NULL; int nid; nid = get_kpfn_nid(page_to_pfn(page)); root = root_unstable_tree + nid; new = &root->rb_node; while (*new) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; /* * Don't substitute a ksm page for a forked page. */ if (page == tree_page) { put_page(tree_page); return NULL; } ret = memcmp_pages(page, tree_page); parent = *new; if (ret < 0) { put_page(tree_page); new = &parent->rb_left; } else if (ret > 0) { put_page(tree_page); new = &parent->rb_right; } else if (!ksm_merge_across_nodes && page_to_nid(tree_page) != nid) { /* * If tree_page has been migrated to another NUMA node, * it will be flushed out and put in the right unstable * tree next time: only merge with it when across_nodes. */ put_page(tree_page); return NULL; } else { *tree_pagep = tree_page; return tree_rmap_item; } } rmap_item->address |= UNSTABLE_FLAG; rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); DO_NUMA(rmap_item->nid = nid); rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, root); ksm_pages_unshared++; return NULL; } /* * stable_tree_append - add another rmap_item to the linked list of * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ static void stable_tree_append(struct ksm_rmap_item *rmap_item, struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* * rmap won't find this mapping if we don't insert the * rmap_item in the right stable_node * duplicate. page_migration could break later if rmap breaks, * so we can as well crash here. We really need to check for * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check * for other negative values as an underflow if detected here * for the first time (and not when decreasing rmap_hlist_len) * would be sign of memory corruption in the stable_node. */ BUG_ON(stable_node->rmap_hlist_len < 0); stable_node->rmap_hlist_len++; if (!max_page_sharing_bypass) /* possibly non fatal but unexpected overflow, only warn */ WARN_ON_ONCE(stable_node->rmap_hlist_len > ksm_max_page_sharing); rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); if (rmap_item->hlist.next) ksm_pages_sharing++; else ksm_pages_shared++; rmap_item->mm->ksm_merging_pages++; } /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can * be inserted into the unstable tree, or merged with a page already there and * both transferred to the stable tree. * * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != NUMA(stable_node->nid)) { stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; /* * If it's a KSM fork, allow it to go over the sharing limit * without warnings. */ if (!is_page_sharing_candidate(stable_node)) max_page_sharing_bypass = true; } else { remove_rmap_item_from_tree(rmap_item); /* * If the hash value of the page has changed from the last time * we calculated it, this page is changing frequently: therefore we * don't want to insert it in the unstable tree, and we don't want * to waste our time searching for something identical to it there. */ checksum = calc_checksum(page); if (rmap_item->oldchecksum != checksum) { rmap_item->oldchecksum = checksum; return; } if (!try_to_merge_with_zero_page(rmap_item, page)) return; } /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); if (&kfolio->page == page && rmap_item->head == stable_node) { folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); if (kfolio) { if (kfolio == ERR_PTR(-EBUSY)) return; err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ folio_lock(kfolio); stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); folio_unlock(kfolio); } folio_put(kfolio); return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { bool split; kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound * page, then we actually ended up increasing the reference * count of the same compound page twice, and split_huge_page * failed. * Here we set a flag if that happened, and we use it later to * try split_huge_page again. Since we call put_page right * afterwards, the reference count will be correct and * split_huge_page should succeed. */ split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ folio_lock(kfolio); stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, * we will have 2 virtual addresses that are pointing * to a ksm page left outside the stable tree, * in which case we need to break_cow on both. */ if (!stable_node) { break_cow(tree_rmap_item); break_cow(rmap_item); } } else if (split) { /* * We are here if we tried to merge two pages and * failed because they both belonged to the same * compound page. We will split the page now, but no * merging will take place. * We do not want to add the cost of a full lock; if * the page is locked, it is better to skip it and * perhaps try again later. */ if (!trylock_page(page)) return; split_huge_page(page); unlock_page(page); } } } static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, struct ksm_rmap_item **rmap_list, unsigned long addr) { struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); } rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; } return rmap_item; } /* * Calculate skip age for the ksm page age. The age determines how often * de-duplicating has already been tried unsuccessfully. If the age is * smaller, the scanning of this page is skipped for less scans. * * @age: rmap_item age of page */ static unsigned int skip_age(rmap_age_t age) { if (age <= 3) return 1; if (age <= 5) return 2; if (age <= 8) return 4; return 8; } /* * Determines if a page should be skipped for the current scan. * * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; if (!ksm_smart_scan) return false; /* * Never skip pages that are already KSM; pages cmp_and_merge_page() * will essentially ignore them, but we still have to process them * properly. */ if (folio_test_ksm(folio)) return false; age = rmap_item->age; if (age != U8_MAX) rmap_item->age++; /* * Smaller ages are not skipped, they need to get a chance to go * through the different phases of the KSM merging. */ if (age < 3) return false; /* * Are we still allowed to skip? If not, then don't skip it * and determine how much more often we are allowed to skip next. */ if (!rmap_item->remaining_skips) { rmap_item->remaining_skips = skip_age(age); return false; } /* Skip this page */ ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; } static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct ksm_mm_slot *mm_slot; struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; mm_slot = ksm_scan.mm_slot; if (mm_slot == &ksm_mm_head) { advisor_start_scan(); trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); /* * A number of pages can hang around indefinitely in per-cpu * LRU cache, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to ksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low). */ lru_add_drain_all(); /* * Whereas stale stable_nodes on the stable_tree itself * get pruned in the regular course of stable_tree_search(), * those moved out to the migrate_nodes list can accumulate: * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { struct ksm_stable_node *stable_node, *next; struct folio *folio; list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_NOLOCK); if (folio) folio_put(folio); cond_resched(); } } for (nid = 0; nid < ksm_nr_node_ids; nid++) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); mmap_read_lock(mm); if (ksm_test_exit(mm)) goto no_vmas; for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; if (!vma->anon_vma) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { struct page *tmp_page = NULL; struct folio_walk fw; struct folio *folio; if (ksm_test_exit(mm)) break; folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); if (folio) { if (!folio_is_zone_device(folio) && folio_test_anon(folio)) { folio_get(folio); tmp_page = fw.page; } folio_walk_end(&fw, vma); } if (tmp_page) { flush_anon_page(vma, tmp_page, ksm_scan.address); flush_dcache_page(tmp_page); rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } ksm_scan.address += PAGE_SIZE; *page = tmp_page; } else { folio_put(folio); } mmap_read_unlock(mm); return rmap_item; } next_page: ksm_scan.address += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); slot = list_entry(mm_slot->slot.mm_node.next, struct mm_slot, mm_node); ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. * This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&mm_slot->slot.hash); list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); } else { mmap_read_unlock(mm); /* * mmap_read_unlock(mm) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * ksm_scan.mm_slot doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ mm_slot = ksm_scan.mm_slot; if (mm_slot != &ksm_mm_head) goto next_mm; advisor_stop_scan(); trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); ksm_scan.seqnr++; return NULL; } /** * ksm_do_scan - the ksm scanner main worker function. * @scan_npages: number of pages we want to scan before we return. */ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) return; cmp_and_merge_page(page, rmap_item); put_page(page); ksm_pages_scanned++; } } static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) { unsigned int sleep_ms; set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksmd_should_run()) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); if (ksmd_should_run()) { sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs); wait_event_freezable_timeout(ksm_iter_wait, sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs), msecs_to_jiffies(sleep_ms)); } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } static void __ksm_add_vma(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; if (vm_flags & VM_MERGEABLE) return; if (vma_ksm_compatible(vma)) vm_flags_set(vma, VM_MERGEABLE); } static int __ksm_del_vma(struct vm_area_struct *vma) { int err; if (!(vma->vm_flags & VM_MERGEABLE)) return 0; if (vma->anon_vma) { err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true); if (err) return err; } vm_flags_clear(vma, VM_MERGEABLE); return 0; } /** * ksm_add_vma - Mark vma as mergeable if compatible * * @vma: Pointer to vma */ void ksm_add_vma(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) __ksm_add_vma(vma); } static void ksm_add_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) __ksm_add_vma(vma); } static int ksm_del_vmas(struct mm_struct *mm) { struct vm_area_struct *vma; int err; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { err = __ksm_del_vma(vma); if (err) return err; } return 0; } /** * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all * compatible VMA's * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_enable_merge_any(struct mm_struct *mm) { int err; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } set_bit(MMF_VM_MERGE_ANY, &mm->flags); ksm_add_vmas(mm); return 0; } /** * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm, * previously enabled via ksm_enable_merge_any(). * * Disabling merging implies unmerging any merged pages, like setting * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and * merging on all compatible VMA's remains enabled. * * @mm: Pointer to mm * * Returns 0 on success, otherwise error code */ int ksm_disable_merge_any(struct mm_struct *mm) { int err; if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return 0; err = ksm_del_vmas(mm); if (err) { ksm_add_vmas(mm); return err; } clear_bit(MMF_VM_MERGE_ANY, &mm->flags); return 0; } int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) return 0; if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; switch (advice) { case MADV_MERGEABLE: if (vma->vm_flags & VM_MERGEABLE) return 0; if (!vma_ksm_compatible(vma)) return 0; if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { err = __ksm_enter(mm); if (err) return err; } *vm_flags |= VM_MERGEABLE; break; case MADV_UNMERGEABLE: if (!(*vm_flags & VM_MERGEABLE)) return 0; /* just ignore the advice */ if (vma->anon_vma) { err = unmerge_ksm_pages(vma, start, end, true); if (err) return err; } *vm_flags &= ~VM_MERGEABLE; break; } return 0; } EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int needs_wakeup; mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; slot = &mm_slot->slot; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); trace_ksm_enter(mm); return 0; } void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; struct mm_slot *slot; int easy_to_free = 0; /* * This process is exiting: if it's straightforward (as is the * case when ksmd was never running), free mm_slot immediately. * But if it's at the cursor or has rmap_items linked to it, use * mmap_lock to synchronize with any break_cows before pagetables * are freed, and leave the mm_slot on the list for ksmd to free. * Beware: ksm may already have noticed it exiting and freed the slot. */ spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&slot->hash); list_del(&slot->mm_node); easy_to_free = 1; } else { list_move(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGE_ANY, &mm->flags); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); mmap_write_unlock(mm); } trace_ksm_exit(mm); } struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct page *page = folio_page(folio, 0); struct anon_vma *anon_vma = folio_anon_vma(folio); struct folio *new_folio; if (folio_test_large(folio)) return folio; if (folio_test_ksm(folio)) { if (folio_stable_node(folio) && !(ksm_run & KSM_RUN_UNMERGE)) return folio; /* no need to copy it */ } else if (!anon_vma) { return folio; /* no need to copy it */ } else if (folio->index == linear_page_index(vma, addr) && anon_vma->root == vma->anon_vma->root) { return folio; /* still no need to copy it */ } if (PageHWPoison(page)) return ERR_PTR(-EHWPOISON); if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); new_folio = NULL; } if (new_folio) { if (copy_mc_user_highpage(folio_page(new_folio, 0), page, addr, vma)) { folio_put(new_folio); return ERR_PTR(-EHWPOISON); } folio_set_dirty(new_folio); __folio_mark_uptodate(new_folio); __folio_set_locked(new_folio); #ifdef CONFIG_SWAP count_vm_event(KSM_SWPIN_COPY); #endif } return new_folio; } void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); /* * Rely on the page lock to protect against concurrent modifications * to that page's node of the stable tree. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); stable_node = folio_stable_node(folio); if (!stable_node) return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); if (!anon_vma_trylock_read(anon_vma)) { if (rwc->try_lock) { rwc->contended = true; return; } anon_vma_lock_read(anon_vma); } anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { unsigned long addr; cond_resched(); vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this * rmap_item; but later, if there is still work to do, * we examine covering vmas in other mms: in case they * were forked from the original since ksmd passed. */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; } if (rwc->done && rwc->done(folio)) { anon_vma_unlock_read(anon_vma); return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; } #ifdef CONFIG_MEMORY_FAILURE /* * Collect processes when the error hit an ksm page. */ void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; struct vm_area_struct *vma; struct task_struct *tsk; stable_node = folio_stable_node(folio); if (!stable_node) return; hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *av = rmap_item->anon_vma; anon_vma_lock_read(av); rcu_read_lock(); for_each_process(tsk) { struct anon_vma_chain *vmac; unsigned long addr; struct task_struct *t = task_early_kill(tsk, force_early); if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0, ULONG_MAX) { vma = vmac->vma; if (vma->vm_mm == t->mm) { addr = rmap_item->address & PAGE_MASK; add_to_kill_ksm(t, page, vma, to_kill, addr); } } } rcu_read_unlock(); anon_vma_unlock_read(av); } } #endif #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio); stable_node = folio_stable_node(folio); if (stable_node) { VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio); stable_node->kpfn = folio_pfn(newfolio); /* * newfolio->mapping was set in advance; now we need smp_wmb() * to make sure that the new stable_node->kpfn is visible * to ksm_get_folio() before it can see that folio->mapping * has gone stale (or that the swapcache flag has been cleared). */ smp_wmb(); folio_set_stable_node(folio, NULL); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static void wait_while_offlining(void) { while (ksm_run & KSM_RUN_OFFLINE) { mutex_unlock(&ksm_thread_mutex); wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), TASK_UNINTERRUPTIBLE); mutex_lock(&ksm_thread_mutex); } } static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) { /* * Don't ksm_get_folio, page has already gone: * which is why we keep kpfn instead of page* */ remove_node_from_stable_tree(stable_node); return true; } return false; } static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { VM_BUG_ON(is_stable_node_dup(stable_node)); return stable_node_dup_remove_range(stable_node, start_pfn, end_pfn); } hlist_for_each_entry_safe(dup, hlist_safe, &stable_node->hlist, hlist_dup) { VM_BUG_ON(!is_stable_node_dup(dup)); stable_node_dup_remove_range(dup, start_pfn, end_pfn); } if (hlist_empty(&stable_node->hlist)) { free_stable_node_chain(stable_node, root); return true; /* notify caller that tree was rebalanced */ } else return false; } static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + nid)) node = rb_first(root_stable_tree + nid); else node = rb_next(node); cond_resched(); } } list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); cond_resched(); } } static int ksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; switch (action) { case MEM_GOING_OFFLINE: /* * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() * and remove_all_stable_nodes() while memory is going offline: * it is unsafe for them to touch the stable tree at this time. * But unmerge_ksm_pages(), rmap lookups and other entry points * which do not need the ksm_thread_mutex are all safe. */ mutex_lock(&ksm_thread_mutex); ksm_run |= KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree, * otherwise ksm_get_folio() might later try to access a * non-existent struct page. */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; mutex_unlock(&ksm_thread_mutex); smp_mb(); /* wake_up_bit advises this */ wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); break; } return NOTIFY_OK; } #else static void wait_while_offlining(void) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define KSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define KSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RW(_name) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; wake_up_interruptible(&ksm_iter_wait); return count; } KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int nr_pages; int err; if (ksm_advisor != KSM_ADVISOR_NONE) return -EINVAL; err = kstrtouint(buf, 10, &nr_pages); if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; return count; } KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int flags; int err; err = kstrtouint(buf, 10, &flags); if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; /* * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, * breaking COW to free the pages_shared (but leaves mm_slots * on the list for when ksmd may be set running again). */ mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; } } } mutex_unlock(&ksm_thread_mutex); if (flags & KSM_RUN_MERGE) wake_up_interruptible(&ksm_thread_wait); return count; } KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long knob; err = kstrtoul(buf, 10, &knob); if (err) return err; if (knob > 1) return -EINVAL; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_merge_across_nodes != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else if (root_stable_tree == one_stable_tree) { struct rb_root *buf; /* * This is the first time that we switch away from the * default of merging across nodes: must now allocate * a buffer to hold as many roots as may be needed. * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; else { root_stable_tree = buf; root_unstable_tree = buf + nr_node_ids; /* Stable tree is empty but not the unstable */ root_unstable_tree[0] = one_unstable_tree[0]; } } if (!err) { ksm_merge_across_nodes = knob; ksm_nr_node_ids = knob ? 1 : nr_node_ids; } } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_use_zero_pages = value; return count; } KSM_ATTR(use_zero_pages); static ssize_t max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; int knob; err = kstrtoint(buf, 10, &knob); if (err) return err; /* * When a KSM page is created it is shared by 2 mappings. This * being a signed comparison, it implicitly verifies it's not * negative. */ if (knob < 2) return -EINVAL; if (READ_ONCE(ksm_max_page_sharing) == knob) return count; mutex_lock(&ksm_thread_mutex); wait_while_offlining(); if (ksm_max_page_sharing != knob) { if (ksm_pages_shared || remove_all_stable_nodes()) err = -EBUSY; else ksm_max_page_sharing = knob; } mutex_unlock(&ksm_thread_mutex); return err ? err : count; } KSM_ATTR(max_page_sharing); static ssize_t pages_scanned_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); } KSM_ATTR_RO(pages_scanned); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); static ssize_t pages_volatile_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long ksm_pages_volatile; ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared - ksm_pages_sharing - ksm_pages_unshared; /* * It was not worth any locking to calculate that statistic, * but it might therefore sometimes be negative: conceal that. */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t pages_skipped_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); } KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { long general_profit; general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); } KSM_ATTR_RO(general_profit); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); static ssize_t stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t stable_node_chains_prune_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned int msecs; int err; err = kstrtouint(buf, 10, &msecs); if (err) return -EINVAL; ksm_stable_node_chains_prune_millisecs = msecs; return count; } KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); static ssize_t smart_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_smart_scan); } static ssize_t smart_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; bool value; err = kstrtobool(buf, &value); if (err) return -EINVAL; ksm_smart_scan = value; return count; } KSM_ATTR(smart_scan); static ssize_t advisor_mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *output; if (ksm_advisor == KSM_ADVISOR_NONE) output = "[none] scan-time"; else if (ksm_advisor == KSM_ADVISOR_SCAN_TIME) output = "none [scan-time]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t advisor_mode_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { enum ksm_advisor_type curr_advisor = ksm_advisor; if (sysfs_streq("scan-time", buf)) ksm_advisor = KSM_ADVISOR_SCAN_TIME; else if (sysfs_streq("none", buf)) ksm_advisor = KSM_ADVISOR_NONE; else return -EINVAL; /* Set advisor default values */ if (curr_advisor != ksm_advisor) set_advisor_defaults(); return count; } KSM_ATTR(advisor_mode); static ssize_t advisor_max_cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu); } static ssize_t advisor_max_cpu_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_cpu = value; return count; } KSM_ATTR(advisor_max_cpu); static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan); } static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_min_pages_to_scan = value; return count; } KSM_ATTR(advisor_min_pages_to_scan); static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan); } static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; ksm_advisor_max_pages_to_scan = value; return count; } KSM_ATTR(advisor_max_pages_to_scan); static ssize_t advisor_target_scan_time_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time); } static ssize_t advisor_target_scan_time_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long value; err = kstrtoul(buf, 10, &value); if (err) return -EINVAL; if (value < 1) return -EINVAL; ksm_advisor_target_scan_time = value; return count; } KSM_ATTR(advisor_target_scan_time); static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif &max_page_sharing_attr.attr, &stable_node_chains_attr.attr, &stable_node_dups_attr.attr, &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, &smart_scan_attr.attr, &advisor_mode_attr.attr, &advisor_max_cpu_attr.attr, &advisor_min_pages_to_scan_attr.attr, &advisor_max_pages_to_scan_attr.attr, &advisor_target_scan_time_attr.attr, NULL, }; static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; #endif /* CONFIG_SYSFS */ static int __init ksm_init(void) { struct task_struct *ksm_thread; int err; /* The correct value depends on page size and endianness */ zero_checksum = calc_checksum(ZERO_PAGE(0)); /* Default to false for backwards compatibility */ ksm_use_zero_pages = false; err = ksm_slab_init(); if (err) goto out; ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); if (IS_ERR(ksm_thread)) { pr_err("ksm: creating kthread failed\n"); err = PTR_ERR(ksm_thread); goto out_free; } #ifdef CONFIG_SYSFS err = sysfs_create_group(mm_kobj, &ksm_attr_group); if (err) { pr_err("ksm: register sysfs failed\n"); kthread_stop(ksm_thread); goto out_free; } #else ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; out_free: ksm_slab_free(); out: return err; } subsys_initcall(ksm_init); |
340 5 447 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 | #ifndef _LINUX_GENERIC_RADIX_TREE_H #define _LINUX_GENERIC_RADIX_TREE_H /** * DOC: Generic radix trees/sparse arrays * * Very simple and minimalistic, supporting arbitrary size entries up to * GENRADIX_NODE_SIZE. * * A genradix is defined with the type it will store, like so: * * static GENRADIX(struct foo) foo_genradix; * * The main operations are: * * - genradix_init(radix) - initialize an empty genradix * * - genradix_free(radix) - free all memory owned by the genradix and * reinitialize it * * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning * NULL if that entry does not exist * * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry, * allocating it if necessary * * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix * * The radix tree allocates one page of entries at a time, so entries may exist * that were never explicitly allocated - they will be initialized to all * zeroes. * * Internally, a genradix is just a radix tree of pages, and indexing works in * terms of byte offsets. The wrappers in this header file use sizeof on the * type the radix contains to calculate a byte offset from the index - see * __idx_to_offset. */ #include <asm/page.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/log2.h> #include <linux/math.h> #include <linux/slab.h> #include <linux/types.h> struct genradix_root; #define GENRADIX_NODE_SHIFT 9 #define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) #define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) /* depth that's needed for a genradix that can address up to ULONG_MAX: */ #define GENRADIX_MAX_DEPTH \ DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) static inline int genradix_depth_shift(unsigned depth) { return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; } /* * Returns size (of data, in bytes) that a tree of a given depth holds: */ static inline size_t genradix_depth_size(unsigned depth) { return 1UL << genradix_depth_shift(depth); } static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } struct __genradix { struct genradix_root *root; }; struct genradix_node { union { /* Interior node: */ struct genradix_node *children[GENRADIX_ARY]; /* Leaf: */ u8 data[GENRADIX_NODE_SIZE]; }; }; static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) { return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); } static inline void genradix_free_node(struct genradix_node *node) { kfree(node); } /* * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: */ #define __GENRADIX_INITIALIZER \ { \ .tree = { \ .root = NULL, \ } \ } /* * We use a 0 size array to stash the type we're storing without taking any * space at runtime - then the various accessor macros can use typeof() to get * to it for casts/sizeof - we also force the alignment so that storing a type * with a ridiculous alignment doesn't blow up the alignment or size of the * genradix. */ #define GENRADIX(_type) \ struct { \ struct __genradix tree; \ _type type[0] __aligned(1); \ } #define DEFINE_GENRADIX(_name, _type) \ GENRADIX(_type) _name = __GENRADIX_INITIALIZER /** * genradix_init - initialize a genradix * @_radix: genradix to initialize * * Does not fail */ #define genradix_init(_radix) \ do { \ *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \ } while (0) void __genradix_free(struct __genradix *); /** * genradix_free: free all memory owned by a genradix * @_radix: the genradix to free * * After freeing, @_radix will be reinitialized and empty */ #define genradix_free(_radix) __genradix_free(&(_radix)->tree) static inline size_t __idx_to_offset(size_t idx, size_t obj_size) { if (__builtin_constant_p(obj_size)) BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); else BUG_ON(obj_size > GENRADIX_NODE_SIZE); if (!is_power_of_2(obj_size)) { size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; return (idx / objs_per_page) * GENRADIX_NODE_SIZE + (idx % objs_per_page) * obj_size; } else { return idx * obj_size; } } #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) #define __genradix_objs_per_page(_radix) \ (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0])) #define __genradix_page_remainder(_radix) \ (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset) { struct genradix_root *r = READ_ONCE(radix->root); struct genradix_node *n = genradix_root_to_node(r); unsigned level = genradix_root_to_depth(r); unsigned shift = genradix_depth_shift(level); if (unlikely(ilog2(offset) >= genradix_depth_shift(level))) return NULL; while (n && shift > GENRADIX_NODE_SHIFT) { shift -= GENRADIX_ARY_SHIFT; n = n->children[offset >> shift]; offset &= (1UL << shift) - 1; } return n ? &n->data[offset] : NULL; } #define genradix_ptr_inlined(_radix, _idx) \ (__genradix_cast(_radix) \ __genradix_ptr_inlined(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx))) void *__genradix_ptr(struct __genradix *, size_t); /** * genradix_ptr - get a pointer to a genradix entry * @_radix: genradix to access * @_idx: index to fetch * * Returns a pointer to entry at @_idx, or NULL if that entry does not exist. */ #define genradix_ptr(_radix, _idx) \ (__genradix_cast(_radix) \ __genradix_ptr(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx))) void *__genradix_ptr_alloc(struct __genradix *, size_t, struct genradix_node **, gfp_t); #define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ (__genradix_cast(_radix) \ (__genradix_ptr_inlined(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx)) ?: \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ NULL, _gfp))) #define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\ (__genradix_cast(_radix) \ (__genradix_ptr_inlined(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx)) ?: \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ _new_node, _gfp))) /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it * if necessary * @_radix: genradix to access * @_idx: index to fetch * @_gfp: gfp mask * * Returns a pointer to entry at @_idx, or NULL on allocation failure */ #define genradix_ptr_alloc(_radix, _idx, _gfp) \ (__genradix_cast(_radix) \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ NULL, _gfp)) #define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\ (__genradix_cast(_radix) \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ _new_node, _gfp)) struct genradix_iter { size_t offset; size_t pos; }; /** * genradix_iter_init - initialize a genradix_iter * @_radix: genradix that will be iterated over * @_idx: index to start iterating from */ #define genradix_iter_init(_radix, _idx) \ ((struct genradix_iter) { \ .pos = (_idx), \ .offset = __genradix_idx_to_offset((_radix), (_idx)),\ }) void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); /** * genradix_iter_peek - get first entry at or above iterator's current * position * @_iter: a genradix_iter * @_radix: genradix being iterated over * * If no more entries exist at or above @_iter's current position, returns NULL */ #define genradix_iter_peek(_iter, _radix) \ (__genradix_cast(_radix) \ __genradix_iter_peek(_iter, &(_radix)->tree, \ __genradix_objs_per_page(_radix))) void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, size_t, size_t); /** * genradix_iter_peek_prev - get first entry at or below iterator's current * position * @_iter: a genradix_iter * @_radix: genradix being iterated over * * If no more entries exist at or below @_iter's current position, returns NULL */ #define genradix_iter_peek_prev(_iter, _radix) \ (__genradix_cast(_radix) \ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \ __genradix_objs_per_page(_radix), \ __genradix_obj_size(_radix) + \ __genradix_page_remainder(_radix))) static inline void __genradix_iter_advance(struct genradix_iter *iter, size_t obj_size) { if (iter->offset + obj_size < iter->offset) { iter->offset = SIZE_MAX; iter->pos = SIZE_MAX; return; } iter->offset += obj_size; if (!is_power_of_2(obj_size) && (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE) iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE); iter->pos++; } #define genradix_iter_advance(_iter, _radix) \ __genradix_iter_advance(_iter, __genradix_obj_size(_radix)) static inline void __genradix_iter_rewind(struct genradix_iter *iter, size_t obj_size) { if (iter->offset == 0 || iter->offset == SIZE_MAX) { iter->offset = SIZE_MAX; return; } if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0) iter->offset -= GENRADIX_NODE_SIZE % obj_size; iter->offset -= obj_size; iter->pos--; } #define genradix_iter_rewind(_iter, _radix) \ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix)) #define genradix_for_each_from(_radix, _iter, _p, _start) \ for (_iter = genradix_iter_init(_radix, _start); \ (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \ genradix_iter_advance(&_iter, _radix)) /** * genradix_for_each - iterate over entry in a genradix * @_radix: genradix to iterate over * @_iter: a genradix_iter to track current position * @_p: pointer to genradix entry type * * On every iteration, @_p will point to the current entry, and @_iter.pos * will be the current entry's index. */ #define genradix_for_each(_radix, _iter, _p) \ genradix_for_each_from(_radix, _iter, _p, 0) #define genradix_last_pos(_radix) \ (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1) /** * genradix_for_each_reverse - iterate over entry in a genradix, reverse order * @_radix: genradix to iterate over * @_iter: a genradix_iter to track current position * @_p: pointer to genradix entry type * * On every iteration, @_p will point to the current entry, and @_iter.pos * will be the current entry's index. */ #define genradix_for_each_reverse(_radix, _iter, _p) \ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\ genradix_iter_rewind(&_iter, _radix)) int __genradix_prealloc(struct __genradix *, size_t, gfp_t); /** * genradix_prealloc - preallocate entries in a generic radix tree * @_radix: genradix to preallocate * @_nr: number of entries to preallocate * @_gfp: gfp mask * * Returns 0 on success, -ENOMEM on failure */ #define genradix_prealloc(_radix, _nr, _gfp) \ __genradix_prealloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _nr + 1),\ _gfp) #endif /* _LINUX_GENERIC_RADIX_TREE_H */ |
6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_IO_READ_H #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" struct bch_read_bio { struct bch_fs *c; u64 start_time; u64 submit_time; /* * Reads will often have to be split, and if the extent being read from * was checksummed or compressed we'll also have to allocate bounce * buffers and copy the data back into the original bio. * * If we didn't have to split, we have to save and restore the original * bi_end_io - @split below indicates which: */ union { struct bch_read_bio *parent; bio_end_io_t *end_io; }; /* * Saved copy of bio->bi_iter, from submission time - allows us to * resubmit on IO error, and also to copy data back to the original bio * when we're bouncing: */ struct bvec_iter bvec_iter; unsigned offset_into_extent; u16 flags; union { struct { u16 bounce:1, split:1, kmalloc:1, have_ioref:1, narrow_crcs:1, hole:1, retry:2, context:2; }; u16 _state; }; struct bch_devs_list devs_have; struct extent_ptr_decoded pick; /* * pos we read from - different from data_pos for indirect extents: */ u32 subvol; struct bpos read_pos; /* * start pos of data we read (may not be pos of data we want) - for * promote, narrow extents paths: */ enum btree_id data_btree; struct bpos data_pos; struct bversion version; struct promote_op *promote; struct bch_io_opts opts; struct work_struct work; struct bio bio; }; #define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, struct bkey_buf *); static inline int bch2_read_indirect_extent(struct btree_trans *trans, enum btree_id *data_btree, unsigned *offset_into_extent, struct bkey_buf *k) { if (k->k->k.type != KEY_TYPE_reflink_p) return 0; *data_btree = BTREE_ID_reflink; return __bch2_read_indirect_extent(trans, offset_into_extent, k); } enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, BCH_READ_USER_MAPPED = 1 << 2, BCH_READ_NODECODE = 1 << 3, BCH_READ_LAST_FRAGMENT = 1 << 4, /* internal: */ BCH_READ_MUST_BOUNCE = 1 << 5, BCH_READ_MUST_CLONE = 1 << 6, BCH_READ_IN_RETRY = 1 << 7, }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, struct bch_io_failures *, unsigned); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, data_btree, k, offset_into_extent, NULL, flags); } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { struct bch_io_failures failed = { .nr = 0 }; BUG_ON(rbio->_state); rbio->c = c; rbio->start_time = local_clock(); rbio->subvol = inum.subvol; __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED); } static inline struct bch_read_bio *rbio_init(struct bio *bio, struct bch_io_opts opts) { struct bch_read_bio *rbio = to_rbio(bio); rbio->_state = 0; rbio->promote = NULL; rbio->opts = opts; return rbio; } void bch2_fs_io_read_exit(struct bch_fs *); int bch2_fs_io_read_init(struct bch_fs *); #endif /* _BCACHEFS_IO_READ_H */ |
377 375 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | // SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * Implements 3 trace clock variants, with differing scalability/precision * tradeoffs: * * - local: CPU-local trace clock * - medium: scalable global clock with some jitter * - global: globally monotonic, serialized clock * * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> #include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> /* * trace_clock_local(): the simplest and least coherent tracing clock. * * Useful for tracing that does not cross to other CPUs nor * does it go through idle events. */ u64 notrace trace_clock_local(void) { u64 clock; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ preempt_disable_notrace(); clock = sched_clock(); preempt_enable_notrace(); return clock; } EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of * jitter between CPUs. So it's a pretty scalable clock, but there * can be offsets in the trace data. */ u64 notrace trace_clock(void) { return local_clock(); } EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. * Note that this use of jiffies_64 is not completely safe on * 32-bit systems. But the window is tiny, and the effect if * we are affected is that we will have an obviously bogus * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock * * It has higher overhead than the other trace clocks but is still * an order of magnitude faster than GTOD derived hardware clocks. * * Used by plugins that need globally coherent timestamps. */ /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); /* * The global clock "guarantees" that the events are ordered * between CPUs. But if two events on two different CPUS call * trace_clock_global at roughly the same time, it really does * not matter which one gets the earlier time. Just make sure * that the same CPU will always show a monotonic clock. * * Use a read memory barrier to get the latest written * time that was recorded. */ smp_rmb(); prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) now = prev_time; /* * If in an NMI context then dont risk lockups and simply return * the current time. */ if (unlikely(in_nmi())) goto out; /* Tracing can cause strange recursion, always use a try lock */ if (arch_spin_trylock(&trace_clock_struct.lock)) { /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) now = prev_time; trace_clock_struct.prev_time = now; /* The unlock acts as the wmb for the above rmb */ arch_spin_unlock(&trace_clock_struct.lock); } out: raw_local_irq_restore(flags); return now; } EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; /* * trace_clock_counter(): simply an atomic counter. * Use the trace_counter "counter" for cases where you do not care * about timings, but are interested in strict ordering. */ u64 notrace trace_clock_counter(void) { return atomic64_inc_return(&trace_counter); } |
1 7 6 6 4 4 4 1 4 1 1 1 7 7 7 7 1 6 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2001 Jean-Fredric Clere, Nikolas Zimmermann, Georg Acher * Mark Cave-Ayland, Carlo E Prelz, Dick Streefland * Copyright (c) 2002, 2003 Tuukka Toivonen * Copyright (c) 2008 Erik Andrén * * P/N 861037: Sensor HDCS1000 ASIC STV0600 * P/N 861050-0010: Sensor HDCS1000 ASIC STV0600 * P/N 861050-0020: Sensor Photobit PB100 ASIC STV0600-1 - QuickCam Express * P/N 861055: Sensor ST VV6410 ASIC STV0610 - LEGO cam * P/N 861075-0040: Sensor HDCS1000 ASIC * P/N 961179-0700: Sensor ST VV6410 ASIC STV0602 - Dexxa WebCam USB * P/N 861040-0000: Sensor ST VV6410 ASIC STV0610 - QuickCam Web */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/input.h> #include "stv06xx_sensor.h" MODULE_AUTHOR("Erik Andrén"); MODULE_DESCRIPTION("STV06XX USB Camera Driver"); MODULE_LICENSE("GPL"); static bool dump_bridge; static bool dump_sensor; int stv06xx_write_bridge(struct sd *sd, u16 address, u16 i2c_data) { int err; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; u8 len = (i2c_data > 0xff) ? 2 : 1; buf[0] = i2c_data & 0xff; buf[1] = (i2c_data >> 8) & 0xff; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, address, 0, buf, len, STV06XX_URB_MSG_TIMEOUT); gspca_dbg(gspca_dev, D_CONF, "Written 0x%x to address 0x%x, status: %d\n", i2c_data, address, err); return (err < 0) ? err : 0; } int stv06xx_read_bridge(struct sd *sd, u16 address, u8 *i2c_data) { int err; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; err = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x04, 0xc0, address, 0, buf, 1, STV06XX_URB_MSG_TIMEOUT); *i2c_data = buf[0]; gspca_dbg(gspca_dev, D_CONF, "Reading 0x%x from address 0x%x, status %d\n", *i2c_data, address, err); return (err < 0) ? err : 0; } /* Wraps the normal write sensor bytes / words functions for writing a single value */ int stv06xx_write_sensor(struct sd *sd, u8 address, u16 value) { if (sd->sensor->i2c_len == 2) { u16 data[2] = { address, value }; return stv06xx_write_sensor_words(sd, data, 1); } else { u8 data[2] = { address, value }; return stv06xx_write_sensor_bytes(sd, data, 1); } } static int stv06xx_write_sensor_finish(struct sd *sd) { int err = 0; if (sd->bridge == BRIDGE_STV610) { struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; buf[0] = 0; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x1704, 0, buf, 1, STV06XX_URB_MSG_TIMEOUT); } return (err < 0) ? err : 0; } int stv06xx_write_sensor_bytes(struct sd *sd, const u8 *data, u8 len) { int err, i, j; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; gspca_dbg(gspca_dev, D_CONF, "I2C: Command buffer contains %d entries\n", len); for (i = 0; i < len;) { /* Build the command buffer */ memset(buf, 0, I2C_BUFFER_LENGTH); for (j = 0; j < I2C_MAX_BYTES && i < len; j++, i++) { buf[j] = data[2*i]; buf[0x10 + j] = data[2*i+1]; gspca_dbg(gspca_dev, D_CONF, "I2C: Writing 0x%02x to reg 0x%02x\n", data[2*i+1], data[2*i]); } buf[0x20] = sd->sensor->i2c_addr; buf[0x21] = j - 1; /* Number of commands to send - 1 */ buf[0x22] = I2C_WRITE_CMD; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x0400, 0, buf, I2C_BUFFER_LENGTH, STV06XX_URB_MSG_TIMEOUT); if (err < 0) return err; } return stv06xx_write_sensor_finish(sd); } int stv06xx_write_sensor_words(struct sd *sd, const u16 *data, u8 len) { int err, i, j; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; gspca_dbg(gspca_dev, D_CONF, "I2C: Command buffer contains %d entries\n", len); for (i = 0; i < len;) { /* Build the command buffer */ memset(buf, 0, I2C_BUFFER_LENGTH); for (j = 0; j < I2C_MAX_WORDS && i < len; j++, i++) { buf[j] = data[2*i]; buf[0x10 + j * 2] = data[2*i+1]; buf[0x10 + j * 2 + 1] = data[2*i+1] >> 8; gspca_dbg(gspca_dev, D_CONF, "I2C: Writing 0x%04x to reg 0x%02x\n", data[2*i+1], data[2*i]); } buf[0x20] = sd->sensor->i2c_addr; buf[0x21] = j - 1; /* Number of commands to send - 1 */ buf[0x22] = I2C_WRITE_CMD; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x0400, 0, buf, I2C_BUFFER_LENGTH, STV06XX_URB_MSG_TIMEOUT); if (err < 0) return err; } return stv06xx_write_sensor_finish(sd); } int stv06xx_read_sensor(struct sd *sd, const u8 address, u16 *value) { int err; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; err = stv06xx_write_bridge(sd, STV_I2C_FLUSH, sd->sensor->i2c_flush); if (err < 0) return err; /* Clear mem */ memset(buf, 0, I2C_BUFFER_LENGTH); buf[0] = address; buf[0x20] = sd->sensor->i2c_addr; buf[0x21] = 0; /* Read I2C register */ buf[0x22] = I2C_READ_CMD; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x1400, 0, buf, I2C_BUFFER_LENGTH, STV06XX_URB_MSG_TIMEOUT); if (err < 0) { pr_err("I2C: Read error writing address: %d\n", err); return err; } err = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x04, 0xc0, 0x1410, 0, buf, sd->sensor->i2c_len, STV06XX_URB_MSG_TIMEOUT); if (sd->sensor->i2c_len == 2) *value = buf[0] | (buf[1] << 8); else *value = buf[0]; gspca_dbg(gspca_dev, D_CONF, "I2C: Read 0x%x from address 0x%x, status: %d\n", *value, address, err); return (err < 0) ? err : 0; } /* Dumps all bridge registers */ static void stv06xx_dump_bridge(struct sd *sd) { int i; u8 data, buf; pr_info("Dumping all stv06xx bridge registers\n"); for (i = 0x1400; i < 0x160f; i++) { stv06xx_read_bridge(sd, i, &data); pr_info("Read 0x%x from address 0x%x\n", data, i); } pr_info("Testing stv06xx bridge registers for writability\n"); for (i = 0x1400; i < 0x160f; i++) { stv06xx_read_bridge(sd, i, &data); buf = data; stv06xx_write_bridge(sd, i, 0xff); stv06xx_read_bridge(sd, i, &data); if (data == 0xff) pr_info("Register 0x%x is read/write\n", i); else if (data != buf) pr_info("Register 0x%x is read/write, but only partially\n", i); else pr_info("Register 0x%x is read-only\n", i); stv06xx_write_bridge(sd, i, buf); } } /* this function is called at probe and resume time */ static int stv06xx_init(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int err; gspca_dbg(gspca_dev, D_PROBE, "Initializing camera\n"); /* Let the usb init settle for a bit before performing the initialization */ msleep(250); err = sd->sensor->init(sd); if (dump_sensor && sd->sensor->dump) sd->sensor->dump(sd); return (err < 0) ? err : 0; } /* this function is called at probe time */ static int stv06xx_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_PROBE, "Initializing controls\n"); gspca_dev->vdev.ctrl_handler = &gspca_dev->ctrl_handler; return sd->sensor->init_controls(sd); } /* Start the camera */ static int stv06xx_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; struct usb_host_interface *alt; struct usb_interface *intf; int err, packet_size; intf = usb_ifnum_to_if(sd->gspca_dev.dev, sd->gspca_dev.iface); alt = usb_altnum_to_altsetting(intf, sd->gspca_dev.alt); if (!alt) { gspca_err(gspca_dev, "Couldn't get altsetting\n"); return -EIO; } if (alt->desc.bNumEndpoints < 1) return -ENODEV; packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize); err = stv06xx_write_bridge(sd, STV_ISO_SIZE_L, packet_size); if (err < 0) return err; /* Prepare the sensor for start */ err = sd->sensor->start(sd); if (err < 0) goto out; /* Start isochronous streaming */ err = stv06xx_write_bridge(sd, STV_ISO_ENABLE, 1); out: if (err < 0) gspca_dbg(gspca_dev, D_STREAM, "Starting stream failed\n"); else gspca_dbg(gspca_dev, D_STREAM, "Started streaming\n"); return (err < 0) ? err : 0; } static int stv06xx_isoc_init(struct gspca_dev *gspca_dev) { struct usb_interface_cache *intfc; struct usb_host_interface *alt; struct sd *sd = (struct sd *) gspca_dev; intfc = gspca_dev->dev->actconfig->intf_cache[0]; if (intfc->num_altsetting < 2) return -ENODEV; alt = &intfc->altsetting[1]; if (alt->desc.bNumEndpoints < 1) return -ENODEV; /* Start isoc bandwidth "negotiation" at max isoc bandwidth */ alt->endpoint[0].desc.wMaxPacketSize = cpu_to_le16(sd->sensor->max_packet_size[gspca_dev->curr_mode]); return 0; } static int stv06xx_isoc_nego(struct gspca_dev *gspca_dev) { int ret, packet_size, min_packet_size; struct usb_host_interface *alt; struct sd *sd = (struct sd *) gspca_dev; /* * Existence of altsetting and endpoint was verified in * stv06xx_isoc_init() */ alt = &gspca_dev->dev->actconfig->intf_cache[0]->altsetting[1]; packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize); min_packet_size = sd->sensor->min_packet_size[gspca_dev->curr_mode]; if (packet_size <= min_packet_size) return -EIO; packet_size -= 100; if (packet_size < min_packet_size) packet_size = min_packet_size; alt->endpoint[0].desc.wMaxPacketSize = cpu_to_le16(packet_size); ret = usb_set_interface(gspca_dev->dev, gspca_dev->iface, 1); if (ret < 0) gspca_err(gspca_dev, "set alt 1 err %d\n", ret); return ret; } static void stv06xx_stopN(struct gspca_dev *gspca_dev) { int err; struct sd *sd = (struct sd *) gspca_dev; /* stop ISO-streaming */ err = stv06xx_write_bridge(sd, STV_ISO_ENABLE, 0); if (err < 0) goto out; err = sd->sensor->stop(sd); out: if (err < 0) gspca_dbg(gspca_dev, D_STREAM, "Failed to stop stream\n"); else gspca_dbg(gspca_dev, D_STREAM, "Stopped streaming\n"); } /* * Analyse an USB packet of the data stream and store it appropriately. * Each packet contains an integral number of chunks. Each chunk has * 2-bytes identification, followed by 2-bytes that describe the chunk * length. Known/guessed chunk identifications are: * 8001/8005/C001/C005 - Begin new frame * 8002/8006/C002/C006 - End frame * 0200/4200 - Contains actual image data, bayer or compressed * 0005 - 11 bytes of unknown data * 0100 - 2 bytes of unknown data * The 0005 and 0100 chunks seem to appear only in compressed stream. */ static void stv06xx_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_PACK, "Packet of length %d arrived\n", len); /* A packet may contain several frames loop until the whole packet is reached */ while (len) { int id, chunk_len; if (len < 4) { gspca_dbg(gspca_dev, D_PACK, "Packet is smaller than 4 bytes\n"); return; } /* Capture the id */ id = (data[0] << 8) | data[1]; /* Capture the chunk length */ chunk_len = (data[2] << 8) | data[3]; gspca_dbg(gspca_dev, D_PACK, "Chunk id: %x, length: %d\n", id, chunk_len); data += 4; len -= 4; if (len < chunk_len) { gspca_err(gspca_dev, "URB packet length is smaller than the specified chunk length\n"); gspca_dev->last_packet_type = DISCARD_PACKET; return; } /* First byte seem to be 02=data 2nd byte is unknown??? */ if (sd->bridge == BRIDGE_ST6422 && (id & 0xff00) == 0x0200) goto frame_data; switch (id) { case 0x0200: case 0x4200: frame_data: gspca_dbg(gspca_dev, D_PACK, "Frame data packet detected\n"); if (sd->to_skip) { int skip = (sd->to_skip < chunk_len) ? sd->to_skip : chunk_len; data += skip; len -= skip; chunk_len -= skip; sd->to_skip -= skip; } gspca_frame_add(gspca_dev, INTER_PACKET, data, chunk_len); break; case 0x8001: case 0x8005: case 0xc001: case 0xc005: gspca_dbg(gspca_dev, D_PACK, "Starting new frame\n"); /* Create a new frame, chunk length should be zero */ gspca_frame_add(gspca_dev, FIRST_PACKET, NULL, 0); if (sd->bridge == BRIDGE_ST6422) sd->to_skip = gspca_dev->pixfmt.width * 4; if (chunk_len) gspca_err(gspca_dev, "Chunk length is non-zero on a SOF\n"); break; case 0x8002: case 0x8006: case 0xc002: gspca_dbg(gspca_dev, D_PACK, "End of frame detected\n"); /* Complete the last frame (if any) */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); if (chunk_len) gspca_err(gspca_dev, "Chunk length is non-zero on a EOF\n"); break; case 0x0005: gspca_dbg(gspca_dev, D_PACK, "Chunk 0x005 detected\n"); /* Unknown chunk with 11 bytes of data, occurs just before end of each frame in compressed mode */ break; case 0x0100: gspca_dbg(gspca_dev, D_PACK, "Chunk 0x0100 detected\n"); /* Unknown chunk with 2 bytes of data, occurs 2-3 times per USB interrupt */ break; case 0x42ff: gspca_dbg(gspca_dev, D_PACK, "Chunk 0x42ff detected\n"); /* Special chunk seen sometimes on the ST6422 */ break; default: gspca_dbg(gspca_dev, D_PACK, "Unknown chunk 0x%04x detected\n", id); /* Unknown chunk */ } data += chunk_len; len -= chunk_len; } } #if IS_ENABLED(CONFIG_INPUT) static int sd_int_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* interrupt packet data */ int len) /* interrupt packet length */ { int ret = -EINVAL; if (len == 1 && (data[0] == 0x80 || data[0] == 0x10)) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, 1); input_sync(gspca_dev->input_dev); ret = 0; } if (len == 1 && (data[0] == 0x88 || data[0] == 0x11)) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, 0); input_sync(gspca_dev->input_dev); ret = 0; } return ret; } #endif static int stv06xx_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id); static void stv06xx_probe_error(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *)gspca_dev; kfree(sd->sensor_priv); sd->sensor_priv = NULL; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = stv06xx_config, .init = stv06xx_init, .init_controls = stv06xx_init_controls, .probe_error = stv06xx_probe_error, .start = stv06xx_start, .stopN = stv06xx_stopN, .pkt_scan = stv06xx_pkt_scan, .isoc_init = stv06xx_isoc_init, .isoc_nego = stv06xx_isoc_nego, #if IS_ENABLED(CONFIG_INPUT) .int_pkt_scan = sd_int_pkt_scan, #endif }; /* This function is called at probe time */ static int stv06xx_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_PROBE, "Configuring camera\n"); sd->bridge = id->driver_info; gspca_dev->sd_desc = &sd_desc; if (dump_bridge) stv06xx_dump_bridge(sd); sd->sensor = &stv06xx_sensor_st6422; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_vv6410; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_hdcs1x00; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_hdcs1020; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_pb0100; if (!sd->sensor->probe(sd)) return 0; sd->sensor = NULL; return -ENODEV; } /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x046d, 0x0840), .driver_info = BRIDGE_STV600 }, /* QuickCam Express */ {USB_DEVICE(0x046d, 0x0850), .driver_info = BRIDGE_STV610 }, /* LEGO cam / QuickCam Web */ {USB_DEVICE(0x046d, 0x0870), .driver_info = BRIDGE_STV602 }, /* Dexxa WebCam USB */ {USB_DEVICE(0x046D, 0x08F0), .driver_info = BRIDGE_ST6422 }, /* QuickCam Messenger */ {USB_DEVICE(0x046D, 0x08F5), .driver_info = BRIDGE_ST6422 }, /* QuickCam Communicate */ {USB_DEVICE(0x046D, 0x08F6), .driver_info = BRIDGE_ST6422 }, /* QuickCam Messenger (new) */ {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static void sd_disconnect(struct usb_interface *intf) { struct gspca_dev *gspca_dev = usb_get_intfdata(intf); struct sd *sd = (struct sd *) gspca_dev; void *priv = sd->sensor_priv; gspca_dbg(gspca_dev, D_PROBE, "Disconnecting the stv06xx device\n"); sd->sensor = NULL; gspca_disconnect(intf); kfree(priv); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = sd_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); module_param(dump_bridge, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dump_bridge, "Dumps all usb bridge registers at startup"); module_param(dump_sensor, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dump_sensor, "Dumps all sensor registers at startup"); |
33 33 33 1 33 33 1 33 32 25 33 33 25 25 33 8 33 33 33 33 32 33 30 33 33 33 33 33 33 33 33 33 33 28 28 23 33 33 33 33 33 33 33 33 33 33 3 3 2 2 33 29 2 2 2 2 2 33 33 30 3 3 3 33 73 73 73 72 73 72 73 73 73 73 8 8 65 5 67 8 33 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 | // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> #include <linux/array_size.h> #include <linux/bitmap.h> #include <linux/cleanup.h> #include <linux/compat.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/module.h> #include <linux/nospec.h> #include <linux/of.h> #include <linux/pinctrl/consumer.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/srcu.h> #include <linux/string.h> #include <linux/gpio.h> #include <linux/gpio/driver.h> #include <linux/gpio/machine.h> #include <uapi/linux/gpio.h> #include "gpiolib-acpi.h" #include "gpiolib-cdev.h" #include "gpiolib-of.h" #include "gpiolib-swnode.h" #include "gpiolib-sysfs.h" #include "gpiolib.h" #define CREATE_TRACE_POINTS #include <trace/events/gpio.h> /* Implementation infrastructure for GPIO interfaces. * * The GPIO programming interface allows for inlining speed-critical * get/set operations for common cases, so that access to SOC-integrated * GPIOs can sometimes cost only an instruction or two per bit. */ /* Device and char device-related information */ static DEFINE_IDA(gpio_ida); static dev_t gpio_devt; #define GPIO_DEV_MAX 256 /* 256 GPIO chip devices supported */ static int gpio_bus_match(struct device *dev, const struct device_driver *drv) { struct fwnode_handle *fwnode = dev_fwnode(dev); /* * Only match if the fwnode doesn't already have a proper struct device * created for it. */ if (fwnode && fwnode->dev != dev) return 0; return 1; } static const struct bus_type gpio_bus_type = { .name = "gpio", .match = gpio_bus_match, }; /* * Number of GPIOs to use for the fast path in set array */ #define FASTPATH_NGPIO CONFIG_GPIOLIB_FASTPATH_LIMIT static DEFINE_MUTEX(gpio_lookup_lock); static LIST_HEAD(gpio_lookup_list); static LIST_HEAD(gpio_devices); /* Protects the GPIO device list against concurrent modifications. */ static DEFINE_MUTEX(gpio_devices_lock); /* Ensures coherence during read-only accesses to the list of GPIO devices. */ DEFINE_STATIC_SRCU(gpio_devices_srcu); static DEFINE_MUTEX(gpio_machine_hogs_mutex); static LIST_HEAD(gpio_machine_hogs); const char *const gpio_suffixes[] = { "gpios", "gpio", NULL }; static void gpiochip_free_hogs(struct gpio_chip *gc); static int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key); static void gpiochip_irqchip_remove(struct gpio_chip *gc); static int gpiochip_irqchip_init_hw(struct gpio_chip *gc); static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gc); static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc); static bool gpiolib_initialized; const char *gpiod_get_label(struct gpio_desc *desc) { struct gpio_desc_label *label; unsigned long flags; flags = READ_ONCE(desc->flags); label = srcu_dereference_check(desc->label, &desc->gdev->desc_srcu, srcu_read_lock_held(&desc->gdev->desc_srcu)); if (test_bit(FLAG_USED_AS_IRQ, &flags)) return label ? label->str : "interrupt"; if (!test_bit(FLAG_REQUESTED, &flags)) return NULL; return label ? label->str : NULL; } static void desc_free_label(struct rcu_head *rh) { kfree(container_of(rh, struct gpio_desc_label, rh)); } static int desc_set_label(struct gpio_desc *desc, const char *label) { struct gpio_desc_label *new = NULL, *old; if (label) { new = kzalloc(struct_size(new, str, strlen(label) + 1), GFP_KERNEL); if (!new) return -ENOMEM; strcpy(new->str, label); } old = rcu_replace_pointer(desc->label, new, 1); if (old) call_srcu(&desc->gdev->desc_srcu, &old->rh, desc_free_label); return 0; } /** * gpio_to_desc - Convert a GPIO number to its descriptor * @gpio: global GPIO number * * Returns: * The GPIO descriptor associated with the given GPIO, or %NULL if no GPIO * with the given number exists in the system. */ struct gpio_desc *gpio_to_desc(unsigned gpio) { struct gpio_device *gdev; scoped_guard(srcu, &gpio_devices_srcu) { list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { if (gdev->base <= gpio && gdev->base + gdev->ngpio > gpio) return &gdev->descs[gpio - gdev->base]; } } return NULL; } EXPORT_SYMBOL_GPL(gpio_to_desc); /* This function is deprecated and will be removed soon, don't use. */ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum) { return gpio_device_get_desc(gc->gpiodev, hwnum); } /** * gpio_device_get_desc() - get the GPIO descriptor corresponding to the given * hardware number for this GPIO device * @gdev: GPIO device to get the descriptor from * @hwnum: hardware number of the GPIO for this chip * * Returns: * A pointer to the GPIO descriptor or %EINVAL if no GPIO exists in the given * chip for the specified hardware number or %ENODEV if the underlying chip * already vanished. * * The reference count of struct gpio_device is *NOT* increased like when the * GPIO is being requested for exclusive usage. It's up to the caller to make * sure the GPIO device will stay alive together with the descriptor returned * by this function. */ struct gpio_desc * gpio_device_get_desc(struct gpio_device *gdev, unsigned int hwnum) { if (hwnum >= gdev->ngpio) return ERR_PTR(-EINVAL); return &gdev->descs[array_index_nospec(hwnum, gdev->ngpio)]; } EXPORT_SYMBOL_GPL(gpio_device_get_desc); /** * desc_to_gpio - convert a GPIO descriptor to the integer namespace * @desc: GPIO descriptor * * This should disappear in the future but is needed since we still * use GPIO numbers for error messages and sysfs nodes. * * Returns: * The global GPIO number for the GPIO specified by its descriptor. */ int desc_to_gpio(const struct gpio_desc *desc) { return desc->gdev->base + (desc - &desc->gdev->descs[0]); } EXPORT_SYMBOL_GPL(desc_to_gpio); /** * gpiod_to_chip - Return the GPIO chip to which a GPIO descriptor belongs * @desc: descriptor to return the chip of * * *DEPRECATED* * This function is unsafe and should not be used. Using the chip address * without taking the SRCU read lock may result in dereferencing a dangling * pointer. * * Returns: * Address of the GPIO chip backing this device. */ struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) { if (!desc) return NULL; return gpio_device_get_chip(desc->gdev); } EXPORT_SYMBOL_GPL(gpiod_to_chip); /** * gpiod_to_gpio_device() - Return the GPIO device to which this descriptor * belongs. * @desc: Descriptor for which to return the GPIO device. * * This *DOES NOT* increase the reference count of the GPIO device as it's * expected that the descriptor is requested and the users already holds a * reference to the device. * * Returns: * Address of the GPIO device owning this descriptor. */ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc) { if (!desc) return NULL; return desc->gdev; } EXPORT_SYMBOL_GPL(gpiod_to_gpio_device); /** * gpio_device_get_base() - Get the base GPIO number allocated by this device * @gdev: GPIO device * * Returns: * First GPIO number in the global GPIO numberspace for this device. */ int gpio_device_get_base(struct gpio_device *gdev) { return gdev->base; } EXPORT_SYMBOL_GPL(gpio_device_get_base); /** * gpio_device_get_label() - Get the label of this GPIO device * @gdev: GPIO device * * Returns: * Pointer to the string containing the GPIO device label. The string's * lifetime is tied to that of the underlying GPIO device. */ const char *gpio_device_get_label(struct gpio_device *gdev) { return gdev->label; } EXPORT_SYMBOL(gpio_device_get_label); /** * gpio_device_get_chip() - Get the gpio_chip implementation of this GPIO device * @gdev: GPIO device * * Returns: * Address of the GPIO chip backing this device. * * *DEPRECATED* * Until we can get rid of all non-driver users of struct gpio_chip, we must * provide a way of retrieving the pointer to it from struct gpio_device. This * is *NOT* safe as the GPIO API is considered to be hot-unpluggable and the * chip can dissapear at any moment (unlike reference-counted struct * gpio_device). * * Use at your own risk. */ struct gpio_chip *gpio_device_get_chip(struct gpio_device *gdev) { return rcu_dereference_check(gdev->chip, 1); } EXPORT_SYMBOL_GPL(gpio_device_get_chip); /* dynamic allocation of GPIOs, e.g. on a hotplugged device */ static int gpiochip_find_base_unlocked(u16 ngpio) { unsigned int base = GPIO_DYNAMIC_BASE; struct gpio_device *gdev; list_for_each_entry_srcu(gdev, &gpio_devices, list, lockdep_is_held(&gpio_devices_lock)) { /* found a free space? */ if (gdev->base >= base + ngpio) break; /* nope, check the space right after the chip */ base = gdev->base + gdev->ngpio; if (base < GPIO_DYNAMIC_BASE) base = GPIO_DYNAMIC_BASE; if (base > GPIO_DYNAMIC_MAX - ngpio) break; } if (base <= GPIO_DYNAMIC_MAX - ngpio) { pr_debug("%s: found new base at %d\n", __func__, base); return base; } else { pr_err("%s: cannot find free range\n", __func__); return -ENOSPC; } } /** * gpiod_get_direction - return the current direction of a GPIO * @desc: GPIO to get the direction of * * Returns: * 0 for output, 1 for input, or an error code in case of error. * * This function may sleep if gpiod_cansleep() is true. */ int gpiod_get_direction(struct gpio_desc *desc) { unsigned long flags; unsigned int offset; int ret; /* * We cannot use VALIDATE_DESC() as we must not return 0 for a NULL * descriptor like we usually do. */ if (IS_ERR_OR_NULL(desc)) return -EINVAL; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; offset = gpio_chip_hwgpio(desc); flags = READ_ONCE(desc->flags); /* * Open drain emulation using input mode may incorrectly report * input here, fix that up. */ if (test_bit(FLAG_OPEN_DRAIN, &flags) && test_bit(FLAG_IS_OUT, &flags)) return 0; if (!guard.gc->get_direction) return -ENOTSUPP; ret = guard.gc->get_direction(guard.gc, offset); if (ret < 0) return ret; /* * GPIO_LINE_DIRECTION_IN or other positive, * otherwise GPIO_LINE_DIRECTION_OUT. */ if (ret > 0) ret = 1; assign_bit(FLAG_IS_OUT, &flags, !ret); WRITE_ONCE(desc->flags, flags); return ret; } EXPORT_SYMBOL_GPL(gpiod_get_direction); /* * Add a new chip to the global chips list, keeping the list of chips sorted * by range(means [base, base + ngpio - 1]) order. * * Returns: * -EBUSY if the new chip overlaps with some other chip's integer space. */ static int gpiodev_add_to_list_unlocked(struct gpio_device *gdev) { struct gpio_device *prev, *next; lockdep_assert_held(&gpio_devices_lock); if (list_empty(&gpio_devices)) { /* initial entry in list */ list_add_tail_rcu(&gdev->list, &gpio_devices); return 0; } next = list_first_entry(&gpio_devices, struct gpio_device, list); if (gdev->base + gdev->ngpio <= next->base) { /* add before first entry */ list_add_rcu(&gdev->list, &gpio_devices); return 0; } prev = list_last_entry(&gpio_devices, struct gpio_device, list); if (prev->base + prev->ngpio <= gdev->base) { /* add behind last entry */ list_add_tail_rcu(&gdev->list, &gpio_devices); return 0; } list_for_each_entry_safe(prev, next, &gpio_devices, list) { /* at the end of the list */ if (&next->list == &gpio_devices) break; /* add between prev and next */ if (prev->base + prev->ngpio <= gdev->base && gdev->base + gdev->ngpio <= next->base) { list_add_rcu(&gdev->list, &prev->list); return 0; } } synchronize_srcu(&gpio_devices_srcu); return -EBUSY; } /* * Convert a GPIO name to its descriptor * Note that there is no guarantee that GPIO names are globally unique! * Hence this function will return, if it exists, a reference to the first GPIO * line found that matches the given name. */ static struct gpio_desc *gpio_name_to_desc(const char * const name) { struct gpio_device *gdev; struct gpio_desc *desc; struct gpio_chip *gc; if (!name) return NULL; guard(srcu)(&gpio_devices_srcu); list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); if (!gc) continue; for_each_gpio_desc(gc, desc) { if (desc->name && !strcmp(desc->name, name)) return desc; } } return NULL; } /* * Take the names from gc->names and assign them to their GPIO descriptors. * Warn if a name is already used for a GPIO line on a different GPIO chip. * * Note that: * 1. Non-unique names are still accepted, * 2. Name collisions within the same GPIO chip are not reported. */ static void gpiochip_set_desc_names(struct gpio_chip *gc) { struct gpio_device *gdev = gc->gpiodev; int i; /* First check all names if they are unique */ for (i = 0; i != gc->ngpio; ++i) { struct gpio_desc *gpio; gpio = gpio_name_to_desc(gc->names[i]); if (gpio) dev_warn(&gdev->dev, "Detected name collision for GPIO name '%s'\n", gc->names[i]); } /* Then add all names to the GPIO descriptors */ for (i = 0; i != gc->ngpio; ++i) gdev->descs[i].name = gc->names[i]; } /* * gpiochip_set_names - Set GPIO line names using device properties * @chip: GPIO chip whose lines should be named, if possible * * Looks for device property "gpio-line-names" and if it exists assigns * GPIO line names for the chip. The memory allocated for the assigned * names belong to the underlying firmware node and should not be released * by the caller. */ static int gpiochip_set_names(struct gpio_chip *chip) { struct gpio_device *gdev = chip->gpiodev; struct device *dev = &gdev->dev; const char **names; int ret, i; int count; count = device_property_string_array_count(dev, "gpio-line-names"); if (count < 0) return 0; /* * When offset is set in the driver side we assume the driver internally * is using more than one gpiochip per the same device. We have to stop * setting friendly names if the specified ones with 'gpio-line-names' * are less than the offset in the device itself. This means all the * lines are not present for every single pin within all the internal * gpiochips. */ if (count <= chip->offset) { dev_warn(dev, "gpio-line-names too short (length %d), cannot map names for the gpiochip at offset %u\n", count, chip->offset); return 0; } names = kcalloc(count, sizeof(*names), GFP_KERNEL); if (!names) return -ENOMEM; ret = device_property_read_string_array(dev, "gpio-line-names", names, count); if (ret < 0) { dev_warn(dev, "failed to read GPIO line names\n"); kfree(names); return ret; } /* * When more that one gpiochip per device is used, 'count' can * contain at most number gpiochips x chip->ngpio. We have to * correctly distribute all defined lines taking into account * chip->offset as starting point from where we will assign * the names to pins from the 'names' array. Since property * 'gpio-line-names' cannot contains gaps, we have to be sure * we only assign those pins that really exists since chip->ngpio * can be different of the chip->offset. */ count = (count > chip->offset) ? count - chip->offset : count; if (count > chip->ngpio) count = chip->ngpio; for (i = 0; i < count; i++) { /* * Allow overriding "fixed" names provided by the GPIO * provider. The "fixed" names are more often than not * generic and less informative than the names given in * device properties. */ if (names[chip->offset + i] && names[chip->offset + i][0]) gdev->descs[i].name = names[chip->offset + i]; } kfree(names); return 0; } static unsigned long *gpiochip_allocate_mask(struct gpio_chip *gc) { unsigned long *p; p = bitmap_alloc(gc->ngpio, GFP_KERNEL); if (!p) return NULL; /* Assume by default all GPIOs are valid */ bitmap_fill(p, gc->ngpio); return p; } static void gpiochip_free_mask(unsigned long **p) { bitmap_free(*p); *p = NULL; } static unsigned int gpiochip_count_reserved_ranges(struct gpio_chip *gc) { struct device *dev = &gc->gpiodev->dev; int size; /* Format is "start, count, ..." */ size = device_property_count_u32(dev, "gpio-reserved-ranges"); if (size > 0 && size % 2 == 0) return size; return 0; } static int gpiochip_apply_reserved_ranges(struct gpio_chip *gc) { struct device *dev = &gc->gpiodev->dev; unsigned int size; u32 *ranges; int ret; size = gpiochip_count_reserved_ranges(gc); if (size == 0) return 0; ranges = kmalloc_array(size, sizeof(*ranges), GFP_KERNEL); if (!ranges) return -ENOMEM; ret = device_property_read_u32_array(dev, "gpio-reserved-ranges", ranges, size); if (ret) { kfree(ranges); return ret; } while (size) { u32 count = ranges[--size]; u32 start = ranges[--size]; if (start >= gc->ngpio || start + count > gc->ngpio) continue; bitmap_clear(gc->valid_mask, start, count); } kfree(ranges); return 0; } static int gpiochip_init_valid_mask(struct gpio_chip *gc) { int ret; if (!(gpiochip_count_reserved_ranges(gc) || gc->init_valid_mask)) return 0; gc->valid_mask = gpiochip_allocate_mask(gc); if (!gc->valid_mask) return -ENOMEM; ret = gpiochip_apply_reserved_ranges(gc); if (ret) return ret; if (gc->init_valid_mask) return gc->init_valid_mask(gc, gc->valid_mask, gc->ngpio); return 0; } static void gpiochip_free_valid_mask(struct gpio_chip *gc) { gpiochip_free_mask(&gc->valid_mask); } static int gpiochip_add_pin_ranges(struct gpio_chip *gc) { /* * Device Tree platforms are supposed to use "gpio-ranges" * property. This check ensures that the ->add_pin_ranges() * won't be called for them. */ if (device_property_present(&gc->gpiodev->dev, "gpio-ranges")) return 0; if (gc->add_pin_ranges) return gc->add_pin_ranges(gc); return 0; } bool gpiochip_line_is_valid(const struct gpio_chip *gc, unsigned int offset) { /* No mask means all valid */ if (likely(!gc->valid_mask)) return true; return test_bit(offset, gc->valid_mask); } EXPORT_SYMBOL_GPL(gpiochip_line_is_valid); static void gpiod_free_irqs(struct gpio_desc *desc) { int irq = gpiod_to_irq(desc); struct irq_desc *irqd = irq_to_desc(irq); void *cookie; for (;;) { /* * Make sure the action doesn't go away while we're * dereferencing it. Retrieve and store the cookie value. * If the irq is freed after we release the lock, that's * alright - the underlying maple tree lookup will return NULL * and nothing will happen in free_irq(). */ scoped_guard(mutex, &irqd->request_mutex) { if (!irq_desc_has_action(irqd)) return; cookie = irqd->action->dev_id; } free_irq(irq, cookie); } } /* * The chip is going away but there may be users who had requested interrupts * on its GPIO lines who have no idea about its removal and have no way of * being notified about it. We need to free any interrupts still in use here or * we'll leak memory and resources (like procfs files). */ static void gpiochip_free_remaining_irqs(struct gpio_chip *gc) { struct gpio_desc *desc; for_each_gpio_desc_with_flag(gc, desc, FLAG_USED_AS_IRQ) gpiod_free_irqs(desc); } static void gpiodev_release(struct device *dev) { struct gpio_device *gdev = to_gpio_device(dev); /* Call pending kfree()s for descriptor labels. */ synchronize_srcu(&gdev->desc_srcu); cleanup_srcu_struct(&gdev->desc_srcu); ida_free(&gpio_ida, gdev->id); kfree_const(gdev->label); kfree(gdev->descs); cleanup_srcu_struct(&gdev->srcu); kfree(gdev); } static const struct device_type gpio_dev_type = { .name = "gpio_chip", .release = gpiodev_release, }; #ifdef CONFIG_GPIO_CDEV #define gcdev_register(gdev, devt) gpiolib_cdev_register((gdev), (devt)) #define gcdev_unregister(gdev) gpiolib_cdev_unregister((gdev)) #else /* * gpiolib_cdev_register() indirectly calls device_add(), which is still * required even when cdev is not selected. */ #define gcdev_register(gdev, devt) device_add(&(gdev)->dev) #define gcdev_unregister(gdev) device_del(&(gdev)->dev) #endif static int gpiochip_setup_dev(struct gpio_device *gdev) { struct fwnode_handle *fwnode = dev_fwnode(&gdev->dev); int ret; device_initialize(&gdev->dev); /* * If fwnode doesn't belong to another device, it's safe to clear its * initialized flag. */ if (fwnode && !fwnode->dev) fwnode_dev_initialized(fwnode, false); ret = gcdev_register(gdev, gpio_devt); if (ret) return ret; ret = gpiochip_sysfs_register(gdev); if (ret) goto err_remove_device; dev_dbg(&gdev->dev, "registered GPIOs %u to %u on %s\n", gdev->base, gdev->base + gdev->ngpio - 1, gdev->label); return 0; err_remove_device: gcdev_unregister(gdev); return ret; } static void gpiochip_machine_hog(struct gpio_chip *gc, struct gpiod_hog *hog) { struct gpio_desc *desc; int rv; desc = gpiochip_get_desc(gc, hog->chip_hwnum); if (IS_ERR(desc)) { chip_err(gc, "%s: unable to get GPIO desc: %ld\n", __func__, PTR_ERR(desc)); return; } rv = gpiod_hog(desc, hog->line_name, hog->lflags, hog->dflags); if (rv) gpiod_err(desc, "%s: unable to hog GPIO line (%s:%u): %d\n", __func__, gc->label, hog->chip_hwnum, rv); } static void machine_gpiochip_add(struct gpio_chip *gc) { struct gpiod_hog *hog; mutex_lock(&gpio_machine_hogs_mutex); list_for_each_entry(hog, &gpio_machine_hogs, list) { if (!strcmp(gc->label, hog->chip_label)) gpiochip_machine_hog(gc, hog); } mutex_unlock(&gpio_machine_hogs_mutex); } static void gpiochip_setup_devs(void) { struct gpio_device *gdev; int ret; guard(srcu)(&gpio_devices_srcu); list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { ret = gpiochip_setup_dev(gdev); if (ret) dev_err(&gdev->dev, "Failed to initialize gpio device (%d)\n", ret); } } static void gpiochip_set_data(struct gpio_chip *gc, void *data) { gc->gpiodev->data = data; } /** * gpiochip_get_data() - get per-subdriver data for the chip * @gc: GPIO chip * * Returns: * The per-subdriver data for the chip. */ void *gpiochip_get_data(struct gpio_chip *gc) { return gc->gpiodev->data; } EXPORT_SYMBOL_GPL(gpiochip_get_data); int gpiochip_get_ngpios(struct gpio_chip *gc, struct device *dev) { u32 ngpios = gc->ngpio; int ret; if (ngpios == 0) { ret = device_property_read_u32(dev, "ngpios", &ngpios); if (ret == -ENODATA) /* * -ENODATA means that there is no property found and * we want to issue the error message to the user. * Besides that, we want to return different error code * to state that supplied value is not valid. */ ngpios = 0; else if (ret) return ret; gc->ngpio = ngpios; } if (gc->ngpio == 0) { chip_err(gc, "tried to insert a GPIO chip with zero lines\n"); return -EINVAL; } if (gc->ngpio > FASTPATH_NGPIO) chip_warn(gc, "line cnt %u is greater than fast path cnt %u\n", gc->ngpio, FASTPATH_NGPIO); return 0; } EXPORT_SYMBOL_GPL(gpiochip_get_ngpios); int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, struct lock_class_key *lock_key, struct lock_class_key *request_key) { struct gpio_device *gdev; unsigned int desc_index; int base = 0; int ret = 0; /* * First: allocate and populate the internal stat container, and * set up the struct device. */ gdev = kzalloc(sizeof(*gdev), GFP_KERNEL); if (!gdev) return -ENOMEM; gdev->dev.type = &gpio_dev_type; gdev->dev.bus = &gpio_bus_type; gdev->dev.parent = gc->parent; rcu_assign_pointer(gdev->chip, gc); gc->gpiodev = gdev; gpiochip_set_data(gc, data); /* * If the calling driver did not initialize firmware node, * do it here using the parent device, if any. */ if (gc->fwnode) device_set_node(&gdev->dev, gc->fwnode); else if (gc->parent) device_set_node(&gdev->dev, dev_fwnode(gc->parent)); gdev->id = ida_alloc(&gpio_ida, GFP_KERNEL); if (gdev->id < 0) { ret = gdev->id; goto err_free_gdev; } ret = dev_set_name(&gdev->dev, GPIOCHIP_NAME "%d", gdev->id); if (ret) goto err_free_ida; if (gc->parent && gc->parent->driver) gdev->owner = gc->parent->driver->owner; else if (gc->owner) /* TODO: remove chip->owner */ gdev->owner = gc->owner; else gdev->owner = THIS_MODULE; ret = gpiochip_get_ngpios(gc, &gdev->dev); if (ret) goto err_free_dev_name; gdev->descs = kcalloc(gc->ngpio, sizeof(*gdev->descs), GFP_KERNEL); if (!gdev->descs) { ret = -ENOMEM; goto err_free_dev_name; } gdev->label = kstrdup_const(gc->label ?: "unknown", GFP_KERNEL); if (!gdev->label) { ret = -ENOMEM; goto err_free_descs; } gdev->ngpio = gc->ngpio; gdev->can_sleep = gc->can_sleep; scoped_guard(mutex, &gpio_devices_lock) { /* * TODO: this allocates a Linux GPIO number base in the global * GPIO numberspace for this chip. In the long run we want to * get *rid* of this numberspace and use only descriptors, but * it may be a pipe dream. It will not happen before we get rid * of the sysfs interface anyways. */ base = gc->base; if (base < 0) { base = gpiochip_find_base_unlocked(gc->ngpio); if (base < 0) { ret = base; base = 0; goto err_free_label; } /* * TODO: it should not be necessary to reflect the * assigned base outside of the GPIO subsystem. Go over * drivers and see if anyone makes use of this, else * drop this and assign a poison instead. */ gc->base = base; } else { dev_warn(&gdev->dev, "Static allocation of GPIO base is deprecated, use dynamic allocation.\n"); } gdev->base = base; ret = gpiodev_add_to_list_unlocked(gdev); if (ret) { chip_err(gc, "GPIO integer space overlap, cannot add chip\n"); goto err_free_label; } } ATOMIC_INIT_NOTIFIER_HEAD(&gdev->line_state_notifier); BLOCKING_INIT_NOTIFIER_HEAD(&gdev->device_notifier); ret = init_srcu_struct(&gdev->srcu); if (ret) goto err_remove_from_list; ret = init_srcu_struct(&gdev->desc_srcu); if (ret) goto err_cleanup_gdev_srcu; #ifdef CONFIG_PINCTRL INIT_LIST_HEAD(&gdev->pin_ranges); #endif if (gc->names) gpiochip_set_desc_names(gc); ret = gpiochip_set_names(gc); if (ret) goto err_cleanup_desc_srcu; ret = gpiochip_init_valid_mask(gc); if (ret) goto err_cleanup_desc_srcu; for (desc_index = 0; desc_index < gc->ngpio; desc_index++) { struct gpio_desc *desc = &gdev->descs[desc_index]; desc->gdev = gdev; if (gc->get_direction && gpiochip_line_is_valid(gc, desc_index)) { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->get_direction(gc, desc_index)); } else { assign_bit(FLAG_IS_OUT, &desc->flags, !gc->direction_input); } } ret = of_gpiochip_add(gc); if (ret) goto err_free_valid_mask; ret = gpiochip_add_pin_ranges(gc); if (ret) goto err_remove_of_chip; acpi_gpiochip_add(gc); machine_gpiochip_add(gc); ret = gpiochip_irqchip_init_valid_mask(gc); if (ret) goto err_free_hogs; ret = gpiochip_irqchip_init_hw(gc); if (ret) goto err_remove_irqchip_mask; ret = gpiochip_add_irqchip(gc, lock_key, request_key); if (ret) goto err_remove_irqchip_mask; /* * By first adding the chardev, and then adding the device, * we get a device node entry in sysfs under * /sys/bus/gpio/devices/gpiochipN/dev that can be used for * coldplug of device nodes and other udev business. * We can do this only if gpiolib has been initialized. * Otherwise, defer until later. */ if (gpiolib_initialized) { ret = gpiochip_setup_dev(gdev); if (ret) goto err_remove_irqchip; } return 0; err_remove_irqchip: gpiochip_irqchip_remove(gc); err_remove_irqchip_mask: gpiochip_irqchip_free_valid_mask(gc); err_free_hogs: gpiochip_free_hogs(gc); acpi_gpiochip_remove(gc); gpiochip_remove_pin_ranges(gc); err_remove_of_chip: of_gpiochip_remove(gc); err_free_valid_mask: gpiochip_free_valid_mask(gc); err_cleanup_desc_srcu: cleanup_srcu_struct(&gdev->desc_srcu); err_cleanup_gdev_srcu: cleanup_srcu_struct(&gdev->srcu); err_remove_from_list: scoped_guard(mutex, &gpio_devices_lock) list_del_rcu(&gdev->list); synchronize_srcu(&gpio_devices_srcu); if (gdev->dev.release) { /* release() has been registered by gpiochip_setup_dev() */ gpio_device_put(gdev); goto err_print_message; } err_free_label: kfree_const(gdev->label); err_free_descs: kfree(gdev->descs); err_free_dev_name: kfree(dev_name(&gdev->dev)); err_free_ida: ida_free(&gpio_ida, gdev->id); err_free_gdev: kfree(gdev); err_print_message: /* failures here can mean systems won't boot... */ if (ret != -EPROBE_DEFER) { pr_err("%s: GPIOs %d..%d (%s) failed to register, %d\n", __func__, base, base + (int)gc->ngpio - 1, gc->label ? : "generic", ret); } return ret; } EXPORT_SYMBOL_GPL(gpiochip_add_data_with_key); /** * gpiochip_remove() - unregister a gpio_chip * @gc: the chip to unregister * * A gpio_chip with any GPIOs still requested may not be removed. */ void gpiochip_remove(struct gpio_chip *gc) { struct gpio_device *gdev = gc->gpiodev; /* FIXME: should the legacy sysfs handling be moved to gpio_device? */ gpiochip_sysfs_unregister(gdev); gpiochip_free_hogs(gc); gpiochip_free_remaining_irqs(gc); scoped_guard(mutex, &gpio_devices_lock) list_del_rcu(&gdev->list); synchronize_srcu(&gpio_devices_srcu); /* Numb the device, cancelling all outstanding operations */ rcu_assign_pointer(gdev->chip, NULL); synchronize_srcu(&gdev->srcu); gpiochip_irqchip_remove(gc); acpi_gpiochip_remove(gc); of_gpiochip_remove(gc); gpiochip_remove_pin_ranges(gc); gpiochip_free_valid_mask(gc); /* * We accept no more calls into the driver from this point, so * NULL the driver data pointer. */ gpiochip_set_data(gc, NULL); /* * The gpiochip side puts its use of the device to rest here: * if there are no userspace clients, the chardev and device will * be removed, else it will be dangling until the last user is * gone. */ gcdev_unregister(gdev); gpio_device_put(gdev); } EXPORT_SYMBOL_GPL(gpiochip_remove); /** * gpio_device_find() - find a specific GPIO device * @data: data to pass to match function * @match: Callback function to check gpio_chip * * Returns: * New reference to struct gpio_device. * * Similar to bus_find_device(). It returns a reference to a gpio_device as * determined by a user supplied @match callback. The callback should return * 0 if the device doesn't match and non-zero if it does. If the callback * returns non-zero, this function will return to the caller and not iterate * over any more gpio_devices. * * The callback takes the GPIO chip structure as argument. During the execution * of the callback function the chip is protected from being freed. TODO: This * actually has yet to be implemented. * * If the function returns non-NULL, the returned reference must be freed by * the caller using gpio_device_put(). */ struct gpio_device *gpio_device_find(const void *data, int (*match)(struct gpio_chip *gc, const void *data)) { struct gpio_device *gdev; struct gpio_chip *gc; might_sleep(); guard(srcu)(&gpio_devices_srcu); list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { if (!device_is_registered(&gdev->dev)) continue; guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); if (gc && match(gc, data)) return gpio_device_get(gdev); } return NULL; } EXPORT_SYMBOL_GPL(gpio_device_find); static int gpio_chip_match_by_label(struct gpio_chip *gc, const void *label) { return gc->label && !strcmp(gc->label, label); } /** * gpio_device_find_by_label() - wrapper around gpio_device_find() finding the * GPIO device by its backing chip's label * @label: Label to lookup * * Returns: * Reference to the GPIO device or NULL. Reference must be released with * gpio_device_put(). */ struct gpio_device *gpio_device_find_by_label(const char *label) { return gpio_device_find((void *)label, gpio_chip_match_by_label); } EXPORT_SYMBOL_GPL(gpio_device_find_by_label); static int gpio_chip_match_by_fwnode(struct gpio_chip *gc, const void *fwnode) { return device_match_fwnode(&gc->gpiodev->dev, fwnode); } /** * gpio_device_find_by_fwnode() - wrapper around gpio_device_find() finding * the GPIO device by its fwnode * @fwnode: Firmware node to lookup * * Returns: * Reference to the GPIO device or NULL. Reference must be released with * gpio_device_put(). */ struct gpio_device *gpio_device_find_by_fwnode(const struct fwnode_handle *fwnode) { return gpio_device_find((void *)fwnode, gpio_chip_match_by_fwnode); } EXPORT_SYMBOL_GPL(gpio_device_find_by_fwnode); /** * gpio_device_get() - Increase the reference count of this GPIO device * @gdev: GPIO device to increase the refcount for * * Returns: * Pointer to @gdev. */ struct gpio_device *gpio_device_get(struct gpio_device *gdev) { return to_gpio_device(get_device(&gdev->dev)); } EXPORT_SYMBOL_GPL(gpio_device_get); /** * gpio_device_put() - Decrease the reference count of this GPIO device and * possibly free all resources associated with it. * @gdev: GPIO device to decrease the reference count for */ void gpio_device_put(struct gpio_device *gdev) { put_device(&gdev->dev); } EXPORT_SYMBOL_GPL(gpio_device_put); /** * gpio_device_to_device() - Retrieve the address of the underlying struct * device. * @gdev: GPIO device for which to return the address. * * This does not increase the reference count of the GPIO device nor the * underlying struct device. * * Returns: * Address of struct device backing this GPIO device. */ struct device *gpio_device_to_device(struct gpio_device *gdev) { return &gdev->dev; } EXPORT_SYMBOL_GPL(gpio_device_to_device); #ifdef CONFIG_GPIOLIB_IRQCHIP /* * The following is irqchip helper code for gpiochips. */ static int gpiochip_irqchip_init_hw(struct gpio_chip *gc) { struct gpio_irq_chip *girq = &gc->irq; if (!girq->init_hw) return 0; return girq->init_hw(gc); } static int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gc) { struct gpio_irq_chip *girq = &gc->irq; if (!girq->init_valid_mask) return 0; girq->valid_mask = gpiochip_allocate_mask(gc); if (!girq->valid_mask) return -ENOMEM; girq->init_valid_mask(gc, girq->valid_mask, gc->ngpio); return 0; } static void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) { gpiochip_free_mask(&gc->irq.valid_mask); } static bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, unsigned int offset) { if (!gpiochip_line_is_valid(gc, offset)) return false; /* No mask means all valid */ if (likely(!gc->irq.valid_mask)) return true; return test_bit(offset, gc->irq.valid_mask); } #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** * gpiochip_set_hierarchical_irqchip() - connects a hierarchical irqchip * to a gpiochip * @gc: the gpiochip to set the irqchip hierarchical handler to * @irqchip: the irqchip to handle this level of the hierarchy, the interrupt * will then percolate up to the parent */ static void gpiochip_set_hierarchical_irqchip(struct gpio_chip *gc, struct irq_chip *irqchip) { /* DT will deal with mapping each IRQ as we go along */ if (is_of_node(gc->irq.fwnode)) return; /* * This is for legacy and boardfile "irqchip" fwnodes: allocate * irqs upfront instead of dynamically since we don't have the * dynamic type of allocation that hardware description languages * provide. Once all GPIO drivers using board files are gone from * the kernel we can delete this code, but for a transitional period * it is necessary to keep this around. */ if (is_fwnode_irqchip(gc->irq.fwnode)) { int i; int ret; for (i = 0; i < gc->ngpio; i++) { struct irq_fwspec fwspec; unsigned int parent_hwirq; unsigned int parent_type; struct gpio_irq_chip *girq = &gc->irq; /* * We call the child to parent translation function * only to check if the child IRQ is valid or not. * Just pick the rising edge type here as that is what * we likely need to support. */ ret = girq->child_to_parent_hwirq(gc, i, IRQ_TYPE_EDGE_RISING, &parent_hwirq, &parent_type); if (ret) { chip_err(gc, "skip set-up on hwirq %d\n", i); continue; } fwspec.fwnode = gc->irq.fwnode; /* This is the hwirq for the GPIO line side of things */ fwspec.param[0] = girq->child_offset_to_irq(gc, i); /* Just pick something */ fwspec.param[1] = IRQ_TYPE_EDGE_RISING; fwspec.param_count = 2; ret = irq_domain_alloc_irqs(gc->irq.domain, 1, NUMA_NO_NODE, &fwspec); if (ret < 0) { chip_err(gc, "can not allocate irq for GPIO line %d parent hwirq %d in hierarchy domain: %d\n", i, parent_hwirq, ret); } } } chip_err(gc, "%s unknown fwnode type proceed anyway\n", __func__); return; } static int gpiochip_hierarchy_irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *hwirq, unsigned int *type) { /* We support standard DT translation */ if (is_of_node(fwspec->fwnode) && fwspec->param_count == 2) { return irq_domain_translate_twocell(d, fwspec, hwirq, type); } /* This is for board files and others not using DT */ if (is_fwnode_irqchip(fwspec->fwnode)) { int ret; ret = irq_domain_translate_twocell(d, fwspec, hwirq, type); if (ret) return ret; WARN_ON(*type == IRQ_TYPE_NONE); return 0; } return -EINVAL; } static int gpiochip_hierarchy_irq_domain_alloc(struct irq_domain *d, unsigned int irq, unsigned int nr_irqs, void *data) { struct gpio_chip *gc = d->host_data; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; struct irq_fwspec *fwspec = data; union gpio_irq_fwspec gpio_parent_fwspec = {}; unsigned int parent_hwirq; unsigned int parent_type; struct gpio_irq_chip *girq = &gc->irq; int ret; /* * The nr_irqs parameter is always one except for PCI multi-MSI * so this should not happen. */ WARN_ON(nr_irqs != 1); ret = gc->irq.child_irq_domain_ops.translate(d, fwspec, &hwirq, &type); if (ret) return ret; chip_dbg(gc, "allocate IRQ %d, hwirq %lu\n", irq, hwirq); ret = girq->child_to_parent_hwirq(gc, hwirq, type, &parent_hwirq, &parent_type); if (ret) { chip_err(gc, "can't look up hwirq %lu\n", hwirq); return ret; } chip_dbg(gc, "found parent hwirq %u\n", parent_hwirq); /* * We set handle_bad_irq because the .set_type() should * always be invoked and set the right type of handler. */ irq_domain_set_info(d, irq, hwirq, gc->irq.chip, gc, girq->handler, NULL, NULL); irq_set_probe(irq); /* This parent only handles asserted level IRQs */ ret = girq->populate_parent_alloc_arg(gc, &gpio_parent_fwspec, parent_hwirq, parent_type); if (ret) return ret; chip_dbg(gc, "alloc_irqs_parent for %d parent hwirq %d\n", irq, parent_hwirq); irq_set_lockdep_class(irq, gc->irq.lock_key, gc->irq.request_key); ret = irq_domain_alloc_irqs_parent(d, irq, 1, &gpio_parent_fwspec); /* * If the parent irqdomain is msi, the interrupts have already * been allocated, so the EEXIST is good. */ if (irq_domain_is_msi(d->parent) && (ret == -EEXIST)) ret = 0; if (ret) chip_err(gc, "failed to allocate parent hwirq %d for hwirq %lu\n", parent_hwirq, hwirq); return ret; } static unsigned int gpiochip_child_offset_to_irq_noop(struct gpio_chip *gc, unsigned int offset) { return offset; } /** * gpiochip_irq_domain_activate() - Lock a GPIO to be used as an IRQ * @domain: The IRQ domain used by this IRQ chip * @data: Outermost irq_data associated with the IRQ * @reserve: If set, only reserve an interrupt vector instead of assigning one * * This function is a wrapper that calls gpiochip_lock_as_irq() and is to be * used as the activate function for the &struct irq_domain_ops. The host_data * for the IRQ domain must be the &struct gpio_chip. * * Returns: * 0 on success, or negative errno on failure. */ static int gpiochip_irq_domain_activate(struct irq_domain *domain, struct irq_data *data, bool reserve) { struct gpio_chip *gc = domain->host_data; unsigned int hwirq = irqd_to_hwirq(data); return gpiochip_lock_as_irq(gc, hwirq); } /** * gpiochip_irq_domain_deactivate() - Unlock a GPIO used as an IRQ * @domain: The IRQ domain used by this IRQ chip * @data: Outermost irq_data associated with the IRQ * * This function is a wrapper that will call gpiochip_unlock_as_irq() and is to * be used as the deactivate function for the &struct irq_domain_ops. The * host_data for the IRQ domain must be the &struct gpio_chip. */ static void gpiochip_irq_domain_deactivate(struct irq_domain *domain, struct irq_data *data) { struct gpio_chip *gc = domain->host_data; unsigned int hwirq = irqd_to_hwirq(data); return gpiochip_unlock_as_irq(gc, hwirq); } static void gpiochip_hierarchy_setup_domain_ops(struct irq_domain_ops *ops) { ops->activate = gpiochip_irq_domain_activate; ops->deactivate = gpiochip_irq_domain_deactivate; ops->alloc = gpiochip_hierarchy_irq_domain_alloc; /* * We only allow overriding the translate() and free() functions for * hierarchical chips, and this should only be done if the user * really need something other than 1:1 translation for translate() * callback and free if user wants to free up any resources which * were allocated during callbacks, for example populate_parent_alloc_arg. */ if (!ops->translate) ops->translate = gpiochip_hierarchy_irq_domain_translate; if (!ops->free) ops->free = irq_domain_free_irqs_common; } static struct irq_domain *gpiochip_hierarchy_create_domain(struct gpio_chip *gc) { struct irq_domain *domain; if (!gc->irq.child_to_parent_hwirq || !gc->irq.fwnode) { chip_err(gc, "missing irqdomain vital data\n"); return ERR_PTR(-EINVAL); } if (!gc->irq.child_offset_to_irq) gc->irq.child_offset_to_irq = gpiochip_child_offset_to_irq_noop; if (!gc->irq.populate_parent_alloc_arg) gc->irq.populate_parent_alloc_arg = gpiochip_populate_parent_fwspec_twocell; gpiochip_hierarchy_setup_domain_ops(&gc->irq.child_irq_domain_ops); domain = irq_domain_create_hierarchy( gc->irq.parent_domain, 0, gc->ngpio, gc->irq.fwnode, &gc->irq.child_irq_domain_ops, gc); if (!domain) return ERR_PTR(-ENOMEM); gpiochip_set_hierarchical_irqchip(gc, gc->irq.chip); return domain; } static bool gpiochip_hierarchy_is_hierarchical(struct gpio_chip *gc) { return !!gc->irq.parent_domain; } int gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc, union gpio_irq_fwspec *gfwspec, unsigned int parent_hwirq, unsigned int parent_type) { struct irq_fwspec *fwspec = &gfwspec->fwspec; fwspec->fwnode = gc->irq.parent_domain->fwnode; fwspec->param_count = 2; fwspec->param[0] = parent_hwirq; fwspec->param[1] = parent_type; return 0; } EXPORT_SYMBOL_GPL(gpiochip_populate_parent_fwspec_twocell); int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, union gpio_irq_fwspec *gfwspec, unsigned int parent_hwirq, unsigned int parent_type) { struct irq_fwspec *fwspec = &gfwspec->fwspec; fwspec->fwnode = gc->irq.parent_domain->fwnode; fwspec->param_count = 4; fwspec->param[0] = 0; fwspec->param[1] = parent_hwirq; fwspec->param[2] = 0; fwspec->param[3] = parent_type; return 0; } EXPORT_SYMBOL_GPL(gpiochip_populate_parent_fwspec_fourcell); #else static struct irq_domain *gpiochip_hierarchy_create_domain(struct gpio_chip *gc) { return ERR_PTR(-EINVAL); } static bool gpiochip_hierarchy_is_hierarchical(struct gpio_chip *gc) { return false; } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /** * gpiochip_irq_map() - maps an IRQ into a GPIO irqchip * @d: the irqdomain used by this irqchip * @irq: the global irq number used by this GPIO irqchip irq * @hwirq: the local IRQ/GPIO line offset on this gpiochip * * This function will set up the mapping for a certain IRQ line on a * gpiochip by assigning the gpiochip as chip data, and using the irqchip * stored inside the gpiochip. * * Returns: * 0 on success, or negative errno on failure. */ static int gpiochip_irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { struct gpio_chip *gc = d->host_data; int ret = 0; if (!gpiochip_irqchip_irq_valid(gc, hwirq)) return -ENXIO; irq_set_chip_data(irq, gc); /* * This lock class tells lockdep that GPIO irqs are in a different * category than their parents, so it won't report false recursion. */ irq_set_lockdep_class(irq, gc->irq.lock_key, gc->irq.request_key); irq_set_chip_and_handler(irq, gc->irq.chip, gc->irq.handler); /* Chips that use nested thread handlers have them marked */ if (gc->irq.threaded) irq_set_nested_thread(irq, 1); irq_set_noprobe(irq); if (gc->irq.num_parents == 1) ret = irq_set_parent(irq, gc->irq.parents[0]); else if (gc->irq.map) ret = irq_set_parent(irq, gc->irq.map[hwirq]); if (ret < 0) return ret; /* * No set-up of the hardware will happen if IRQ_TYPE_NONE * is passed as default type. */ if (gc->irq.default_type != IRQ_TYPE_NONE) irq_set_irq_type(irq, gc->irq.default_type); return 0; } static void gpiochip_irq_unmap(struct irq_domain *d, unsigned int irq) { struct gpio_chip *gc = d->host_data; if (gc->irq.threaded) irq_set_nested_thread(irq, 0); irq_set_chip_and_handler(irq, NULL, NULL); irq_set_chip_data(irq, NULL); } static const struct irq_domain_ops gpiochip_domain_ops = { .map = gpiochip_irq_map, .unmap = gpiochip_irq_unmap, /* Virtually all GPIO irqchips are twocell:ed */ .xlate = irq_domain_xlate_twocell, }; static struct irq_domain *gpiochip_simple_create_domain(struct gpio_chip *gc) { struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev); struct irq_domain *domain; domain = irq_domain_create_simple(fwnode, gc->ngpio, gc->irq.first, &gpiochip_domain_ops, gc); if (!domain) return ERR_PTR(-EINVAL); return domain; } static int gpiochip_to_irq(struct gpio_chip *gc, unsigned int offset) { struct irq_domain *domain = gc->irq.domain; #ifdef CONFIG_GPIOLIB_IRQCHIP /* * Avoid race condition with other code, which tries to lookup * an IRQ before the irqchip has been properly registered, * i.e. while gpiochip is still being brought up. */ if (!gc->irq.initialized) return -EPROBE_DEFER; #endif if (!gpiochip_irqchip_irq_valid(gc, offset)) return -ENXIO; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (irq_domain_is_hierarchy(domain)) { struct irq_fwspec spec; spec.fwnode = domain->fwnode; spec.param_count = 2; spec.param[0] = gc->irq.child_offset_to_irq(gc, offset); spec.param[1] = IRQ_TYPE_NONE; return irq_create_fwspec_mapping(&spec); } #endif return irq_create_mapping(domain, offset); } int gpiochip_irq_reqres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); return gpiochip_reqres_irq(gc, hwirq); } EXPORT_SYMBOL(gpiochip_irq_reqres); void gpiochip_irq_relres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gpiochip_relres_irq(gc, hwirq); } EXPORT_SYMBOL(gpiochip_irq_relres); static void gpiochip_irq_mask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); if (gc->irq.irq_mask) gc->irq.irq_mask(d); gpiochip_disable_irq(gc, hwirq); } static void gpiochip_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gpiochip_enable_irq(gc, hwirq); if (gc->irq.irq_unmask) gc->irq.irq_unmask(d); } static void gpiochip_irq_enable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gpiochip_enable_irq(gc, hwirq); gc->irq.irq_enable(d); } static void gpiochip_irq_disable(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); unsigned int hwirq = irqd_to_hwirq(d); gc->irq.irq_disable(d); gpiochip_disable_irq(gc, hwirq); } static void gpiochip_set_irq_hooks(struct gpio_chip *gc) { struct irq_chip *irqchip = gc->irq.chip; if (irqchip->flags & IRQCHIP_IMMUTABLE) return; chip_warn(gc, "not an immutable chip, please consider fixing it!\n"); if (!irqchip->irq_request_resources && !irqchip->irq_release_resources) { irqchip->irq_request_resources = gpiochip_irq_reqres; irqchip->irq_release_resources = gpiochip_irq_relres; } if (WARN_ON(gc->irq.irq_enable)) return; /* Check if the irqchip already has this hook... */ if (irqchip->irq_enable == gpiochip_irq_enable || irqchip->irq_mask == gpiochip_irq_mask) { /* * ...and if so, give a gentle warning that this is bad * practice. */ chip_info(gc, "detected irqchip that is shared with multiple gpiochips: please fix the driver.\n"); return; } if (irqchip->irq_disable) { gc->irq.irq_disable = irqchip->irq_disable; irqchip->irq_disable = gpiochip_irq_disable; } else { gc->irq.irq_mask = irqchip->irq_mask; irqchip->irq_mask = gpiochip_irq_mask; } if (irqchip->irq_enable) { gc->irq.irq_enable = irqchip->irq_enable; irqchip->irq_enable = gpiochip_irq_enable; } else { gc->irq.irq_unmask = irqchip->irq_unmask; irqchip->irq_unmask = gpiochip_irq_unmask; } } static int gpiochip_irqchip_add_allocated_domain(struct gpio_chip *gc, struct irq_domain *domain, bool allocated_externally) { if (!domain) return -EINVAL; if (gc->to_irq) chip_warn(gc, "to_irq is redefined in %s and you shouldn't rely on it\n", __func__); gc->to_irq = gpiochip_to_irq; gc->irq.domain = domain; gc->irq.domain_is_allocated_externally = allocated_externally; /* * Using barrier() here to prevent compiler from reordering * gc->irq.initialized before adding irqdomain. */ barrier(); gc->irq.initialized = true; return 0; } /** * gpiochip_add_irqchip() - adds an IRQ chip to a GPIO chip * @gc: the GPIO chip to add the IRQ chip to * @lock_key: lockdep class for IRQ lock * @request_key: lockdep class for IRQ request * * Returns: * 0 on success, or a negative errno on failure. */ static int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key) { struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev); struct irq_chip *irqchip = gc->irq.chip; struct irq_domain *domain; unsigned int type; unsigned int i; int ret; if (!irqchip) return 0; if (gc->irq.parent_handler && gc->can_sleep) { chip_err(gc, "you cannot have chained interrupts on a chip that may sleep\n"); return -EINVAL; } type = gc->irq.default_type; /* * Specifying a default trigger is a terrible idea if DT or ACPI is * used to configure the interrupts, as you may end up with * conflicting triggers. Tell the user, and reset to NONE. */ if (WARN(fwnode && type != IRQ_TYPE_NONE, "%pfw: Ignoring %u default trigger\n", fwnode, type)) type = IRQ_TYPE_NONE; gc->irq.default_type = type; gc->irq.lock_key = lock_key; gc->irq.request_key = request_key; /* If a parent irqdomain is provided, let's build a hierarchy */ if (gpiochip_hierarchy_is_hierarchical(gc)) { domain = gpiochip_hierarchy_create_domain(gc); } else { domain = gpiochip_simple_create_domain(gc); } if (IS_ERR(domain)) return PTR_ERR(domain); if (gc->irq.parent_handler) { for (i = 0; i < gc->irq.num_parents; i++) { void *data; if (gc->irq.per_parent_data) data = gc->irq.parent_handler_data_array[i]; else data = gc->irq.parent_handler_data ?: gc; /* * The parent IRQ chip is already using the chip_data * for this IRQ chip, so our callbacks simply use the * handler_data. */ irq_set_chained_handler_and_data(gc->irq.parents[i], gc->irq.parent_handler, data); } } gpiochip_set_irq_hooks(gc); ret = gpiochip_irqchip_add_allocated_domain(gc, domain, false); if (ret) return ret; acpi_gpiochip_request_interrupts(gc); return 0; } /** * gpiochip_irqchip_remove() - removes an irqchip added to a gpiochip * @gc: the gpiochip to remove the irqchip from * * This is called only from gpiochip_remove() */ static void gpiochip_irqchip_remove(struct gpio_chip *gc) { struct irq_chip *irqchip = gc->irq.chip; unsigned int offset; acpi_gpiochip_free_interrupts(gc); if (irqchip && gc->irq.parent_handler) { struct gpio_irq_chip *irq = &gc->irq; unsigned int i; for (i = 0; i < irq->num_parents; i++) irq_set_chained_handler_and_data(irq->parents[i], NULL, NULL); } /* Remove all IRQ mappings and delete the domain */ if (!gc->irq.domain_is_allocated_externally && gc->irq.domain) { unsigned int irq; for (offset = 0; offset < gc->ngpio; offset++) { if (!gpiochip_irqchip_irq_valid(gc, offset)) continue; irq = irq_find_mapping(gc->irq.domain, offset); irq_dispose_mapping(irq); } irq_domain_remove(gc->irq.domain); } if (irqchip && !(irqchip->flags & IRQCHIP_IMMUTABLE)) { if (irqchip->irq_request_resources == gpiochip_irq_reqres) { irqchip->irq_request_resources = NULL; irqchip->irq_release_resources = NULL; } if (irqchip->irq_enable == gpiochip_irq_enable) { irqchip->irq_enable = gc->irq.irq_enable; irqchip->irq_disable = gc->irq.irq_disable; } } gc->irq.irq_enable = NULL; gc->irq.irq_disable = NULL; gc->irq.chip = NULL; gpiochip_irqchip_free_valid_mask(gc); } /** * gpiochip_irqchip_add_domain() - adds an irqdomain to a gpiochip * @gc: the gpiochip to add the irqchip to * @domain: the irqdomain to add to the gpiochip * * This function adds an IRQ domain to the gpiochip. * * Returns: * 0 on success, or negative errno on failure. */ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain) { return gpiochip_irqchip_add_allocated_domain(gc, domain, true); } EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_domain); #else /* CONFIG_GPIOLIB_IRQCHIP */ static inline int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key) { return 0; } static void gpiochip_irqchip_remove(struct gpio_chip *gc) {} static inline int gpiochip_irqchip_init_hw(struct gpio_chip *gc) { return 0; } static inline int gpiochip_irqchip_init_valid_mask(struct gpio_chip *gc) { return 0; } static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) { } #endif /* CONFIG_GPIOLIB_IRQCHIP */ /** * gpiochip_generic_request() - request the gpio function for a pin * @gc: the gpiochip owning the GPIO * @offset: the offset of the GPIO to request for GPIO function * * Returns: * 0 on success, or negative errno on failure. */ int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) return 0; #endif return pinctrl_gpio_request(gc, offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_request); /** * gpiochip_generic_free() - free the gpio function from a pin * @gc: the gpiochip to request the gpio function for * @offset: the offset of the GPIO to free from GPIO function */ void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) return; #endif pinctrl_gpio_free(gc, offset); } EXPORT_SYMBOL_GPL(gpiochip_generic_free); /** * gpiochip_generic_config() - apply configuration for a pin * @gc: the gpiochip owning the GPIO * @offset: the offset of the GPIO to apply the configuration * @config: the configuration to be applied * * Returns: * 0 on success, or negative errno on failure. */ int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) return -ENOTSUPP; #endif return pinctrl_gpio_set_config(gc, offset, config); } EXPORT_SYMBOL_GPL(gpiochip_generic_config); #ifdef CONFIG_PINCTRL /** * gpiochip_add_pingroup_range() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pctldev: the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_group: name of the pin group inside the pin controller * * Calling this function directly from a DeviceTree-supported * pinctrl driver is DEPRECATED. Please see Section 2.1 of * Documentation/devicetree/bindings/gpio/gpio.txt on how to * bind pinctrl and gpio drivers via the "gpio-ranges" property. * * Returns: * 0 on success, or negative errno on failure. */ int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, unsigned int gpio_offset, const char *pin_group) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; int ret; pin_range = kzalloc(sizeof(*pin_range), GFP_KERNEL); if (!pin_range) { chip_err(gc, "failed to allocate pin ranges\n"); return -ENOMEM; } /* Use local offset as range ID */ pin_range->range.id = gpio_offset; pin_range->range.gc = gc; pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->pctldev = pctldev; ret = pinctrl_get_group_pins(pctldev, pin_group, &pin_range->range.pins, &pin_range->range.npins); if (ret < 0) { kfree(pin_range); return ret; } pinctrl_add_gpio_range(pctldev, &pin_range->range); chip_dbg(gc, "created GPIO range %d->%d ==> %s PINGRP %s\n", gpio_offset, gpio_offset + pin_range->range.npins - 1, pinctrl_dev_get_devname(pctldev), pin_group); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range * * Calling this function directly from a DeviceTree-supported * pinctrl driver is DEPRECATED. Please see Section 2.1 of * Documentation/devicetree/bindings/gpio/gpio.txt on how to * bind pinctrl and gpio drivers via the "gpio-ranges" property. * * Returns: * 0 on success, or a negative errno on failure. */ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, unsigned int npins) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; int ret; pin_range = kzalloc(sizeof(*pin_range), GFP_KERNEL); if (!pin_range) { chip_err(gc, "failed to allocate pin ranges\n"); return -ENOMEM; } /* Use local offset as range ID */ pin_range->range.id = gpio_offset; pin_range->range.gc = gc; pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->range.pin_base = pin_offset; pin_range->range.npins = npins; pin_range->pctldev = pinctrl_find_and_add_gpio_range(pinctl_name, &pin_range->range); if (IS_ERR(pin_range->pctldev)) { ret = PTR_ERR(pin_range->pctldev); chip_err(gc, "could not create pin range\n"); kfree(pin_range); return ret; } chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", gpio_offset, gpio_offset + npins - 1, pinctl_name, pin_offset, pin_offset + npins - 1); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } EXPORT_SYMBOL_GPL(gpiochip_add_pin_range); /** * gpiochip_remove_pin_ranges() - remove all the GPIO <-> pin mappings * @gc: the chip to remove all the mappings for */ void gpiochip_remove_pin_ranges(struct gpio_chip *gc) { struct gpio_pin_range *pin_range, *tmp; struct gpio_device *gdev = gc->gpiodev; list_for_each_entry_safe(pin_range, tmp, &gdev->pin_ranges, node) { list_del(&pin_range->node); pinctrl_remove_gpio_range(pin_range->pctldev, &pin_range->range); kfree(pin_range); } } EXPORT_SYMBOL_GPL(gpiochip_remove_pin_ranges); #endif /* CONFIG_PINCTRL */ /* These "optional" allocation calls help prevent drivers from stomping * on each other, and help provide better diagnostics in debugfs. * They're called even less than the "set direction" calls. */ static int gpiod_request_commit(struct gpio_desc *desc, const char *label) { unsigned int offset; int ret; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; if (test_and_set_bit(FLAG_REQUESTED, &desc->flags)) return -EBUSY; /* NOTE: gpio_request() can be called in early boot, * before IRQs are enabled, for non-sleeping (SOC) GPIOs. */ if (guard.gc->request) { offset = gpio_chip_hwgpio(desc); if (gpiochip_line_is_valid(guard.gc, offset)) ret = guard.gc->request(guard.gc, offset); else ret = -EINVAL; if (ret) goto out_clear_bit; } if (guard.gc->get_direction) gpiod_get_direction(desc); ret = desc_set_label(desc, label ? : "?"); if (ret) goto out_clear_bit; return 0; out_clear_bit: clear_bit(FLAG_REQUESTED, &desc->flags); return ret; } /* * This descriptor validation needs to be inserted verbatim into each * function taking a descriptor, so we need to use a preprocessor * macro to avoid endless duplication. If the desc is NULL it is an * optional GPIO and calls should just bail out. */ static int validate_desc(const struct gpio_desc *desc, const char *func) { if (!desc) return 0; if (IS_ERR(desc)) { pr_warn("%s: invalid GPIO (errorpointer)\n", func); return PTR_ERR(desc); } return 1; } #define VALIDATE_DESC(desc) do { \ int __valid = validate_desc(desc, __func__); \ if (__valid <= 0) \ return __valid; \ } while (0) #define VALIDATE_DESC_VOID(desc) do { \ int __valid = validate_desc(desc, __func__); \ if (__valid <= 0) \ return; \ } while (0) int gpiod_request(struct gpio_desc *desc, const char *label) { int ret = -EPROBE_DEFER; VALIDATE_DESC(desc); if (try_module_get(desc->gdev->owner)) { ret = gpiod_request_commit(desc, label); if (ret) module_put(desc->gdev->owner); else gpio_device_get(desc->gdev); } if (ret) gpiod_dbg(desc, "%s: status %d\n", __func__, ret); return ret; } static void gpiod_free_commit(struct gpio_desc *desc) { unsigned long flags; might_sleep(); CLASS(gpio_chip_guard, guard)(desc); flags = READ_ONCE(desc->flags); if (guard.gc && test_bit(FLAG_REQUESTED, &flags)) { if (guard.gc->free) guard.gc->free(guard.gc, gpio_chip_hwgpio(desc)); clear_bit(FLAG_ACTIVE_LOW, &flags); clear_bit(FLAG_REQUESTED, &flags); clear_bit(FLAG_OPEN_DRAIN, &flags); clear_bit(FLAG_OPEN_SOURCE, &flags); clear_bit(FLAG_PULL_UP, &flags); clear_bit(FLAG_PULL_DOWN, &flags); clear_bit(FLAG_BIAS_DISABLE, &flags); clear_bit(FLAG_EDGE_RISING, &flags); clear_bit(FLAG_EDGE_FALLING, &flags); clear_bit(FLAG_IS_HOGGED, &flags); #ifdef CONFIG_OF_DYNAMIC WRITE_ONCE(desc->hog, NULL); #endif desc_set_label(desc, NULL); WRITE_ONCE(desc->flags, flags); #ifdef CONFIG_GPIO_CDEV WRITE_ONCE(desc->debounce_period_us, 0); #endif gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_RELEASED); } } void gpiod_free(struct gpio_desc *desc) { VALIDATE_DESC_VOID(desc); gpiod_free_commit(desc); module_put(desc->gdev->owner); gpio_device_put(desc->gdev); } /** * gpiochip_dup_line_label - Get a copy of the consumer label. * @gc: GPIO chip controlling this line. * @offset: Hardware offset of the line. * * Returns: * Pointer to a copy of the consumer label if the line is requested or NULL * if it's not. If a valid pointer was returned, it must be freed using * kfree(). In case of a memory allocation error, the function returns %ENOMEM. * * Must not be called from atomic context. */ char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; char *label; desc = gpiochip_get_desc(gc, offset); if (IS_ERR(desc)) return NULL; if (!test_bit(FLAG_REQUESTED, &desc->flags)) return NULL; guard(srcu)(&desc->gdev->desc_srcu); label = kstrdup(gpiod_get_label(desc), GFP_KERNEL); if (!label) return ERR_PTR(-ENOMEM); return label; } EXPORT_SYMBOL_GPL(gpiochip_dup_line_label); static inline const char *function_name_or_default(const char *con_id) { return con_id ?: "(default)"; } /** * gpiochip_request_own_desc - Allow GPIO chip to request its own descriptor * @gc: GPIO chip * @hwnum: hardware number of the GPIO for which to request the descriptor * @label: label for the GPIO * @lflags: lookup flags for this GPIO or 0 if default, this can be used to * specify things like line inversion semantics with the machine flags * such as GPIO_OUT_LOW * @dflags: descriptor request flags for this GPIO or 0 if default, this * can be used to specify consumer semantics such as open drain * * Function allows GPIO chip drivers to request and use their own GPIO * descriptors via gpiolib API. Difference to gpiod_request() is that this * function will not increase reference count of the GPIO chip module. This * allows the GPIO chip module to be unloaded as needed (we assume that the * GPIO chip driver handles freeing the GPIOs it has requested). * * Returns: * A pointer to the GPIO descriptor, or an ERR_PTR()-encoded negative error * code on failure. */ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *gc, unsigned int hwnum, const char *label, enum gpio_lookup_flags lflags, enum gpiod_flags dflags) { struct gpio_desc *desc = gpiochip_get_desc(gc, hwnum); const char *name = function_name_or_default(label); int ret; if (IS_ERR(desc)) { chip_err(gc, "failed to get GPIO %s descriptor\n", name); return desc; } ret = gpiod_request_commit(desc, label); if (ret < 0) return ERR_PTR(ret); ret = gpiod_configure_flags(desc, label, lflags, dflags); if (ret) { gpiod_free_commit(desc); chip_err(gc, "setup of own GPIO %s failed\n", name); return ERR_PTR(ret); } gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_REQUESTED); return desc; } EXPORT_SYMBOL_GPL(gpiochip_request_own_desc); /** * gpiochip_free_own_desc - Free GPIO requested by the chip driver * @desc: GPIO descriptor to free * * Function frees the given GPIO requested previously with * gpiochip_request_own_desc(). */ void gpiochip_free_own_desc(struct gpio_desc *desc) { if (desc) gpiod_free_commit(desc); } EXPORT_SYMBOL_GPL(gpiochip_free_own_desc); /* * Drivers MUST set GPIO direction before making get/set calls. In * some cases this is done in early boot, before IRQs are enabled. * * As a rule these aren't called more than once (except for drivers * using the open-drain emulation idiom) so these are natural places * to accumulate extra debugging checks. Note that we can't (yet) * rely on gpio_request() having been called beforehand. */ int gpio_do_set_config(struct gpio_desc *desc, unsigned long config) { int ret; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; if (!guard.gc->set_config) return -ENOTSUPP; ret = guard.gc->set_config(guard.gc, gpio_chip_hwgpio(desc), config); #ifdef CONFIG_GPIO_CDEV /* * Special case - if we're setting debounce period, we need to store * it in the descriptor in case user-space wants to know it. */ if (!ret && pinconf_to_config_param(config) == PIN_CONFIG_INPUT_DEBOUNCE) WRITE_ONCE(desc->debounce_period_us, pinconf_to_config_argument(config)); #endif return ret; } static int gpio_set_config_with_argument(struct gpio_desc *desc, enum pin_config_param mode, u32 argument) { unsigned long config; config = pinconf_to_config_packed(mode, argument); return gpio_do_set_config(desc, config); } static int gpio_set_config_with_argument_optional(struct gpio_desc *desc, enum pin_config_param mode, u32 argument) { struct device *dev = &desc->gdev->dev; int gpio = gpio_chip_hwgpio(desc); int ret; ret = gpio_set_config_with_argument(desc, mode, argument); if (ret != -ENOTSUPP) return ret; switch (mode) { case PIN_CONFIG_PERSIST_STATE: dev_dbg(dev, "Persistence not supported for GPIO %d\n", gpio); break; default: break; } return 0; } static int gpio_set_config(struct gpio_desc *desc, enum pin_config_param mode) { return gpio_set_config_with_argument(desc, mode, 0); } static int gpio_set_bias(struct gpio_desc *desc) { enum pin_config_param bias; unsigned long flags; unsigned int arg; flags = READ_ONCE(desc->flags); if (test_bit(FLAG_BIAS_DISABLE, &flags)) bias = PIN_CONFIG_BIAS_DISABLE; else if (test_bit(FLAG_PULL_UP, &flags)) bias = PIN_CONFIG_BIAS_PULL_UP; else if (test_bit(FLAG_PULL_DOWN, &flags)) bias = PIN_CONFIG_BIAS_PULL_DOWN; else return 0; switch (bias) { case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: arg = 1; break; default: arg = 0; break; } return gpio_set_config_with_argument_optional(desc, bias, arg); } /** * gpio_set_debounce_timeout() - Set debounce timeout * @desc: GPIO descriptor to set the debounce timeout * @debounce: Debounce timeout in microseconds * * The function calls the certain GPIO driver to set debounce timeout * in the hardware. * * Returns: * 0 on success, or negative errno on failure. */ int gpio_set_debounce_timeout(struct gpio_desc *desc, unsigned int debounce) { int ret; ret = gpio_set_config_with_argument_optional(desc, PIN_CONFIG_INPUT_DEBOUNCE, debounce); if (!ret) gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); return ret; } /** * gpiod_direction_input - set the GPIO direction to input * @desc: GPIO to set to input * * Set the direction of the passed GPIO to input, such as gpiod_get_value() can * be called safely on it. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_direction_input(struct gpio_desc *desc) { int ret; VALIDATE_DESC(desc); ret = gpiod_direction_input_nonotify(desc); if (ret == 0) gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); return ret; } EXPORT_SYMBOL_GPL(gpiod_direction_input); int gpiod_direction_input_nonotify(struct gpio_desc *desc) { int ret = 0; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; /* * It is legal to have no .get() and .direction_input() specified if * the chip is output-only, but you can't specify .direction_input() * and not support the .get() operation, that doesn't make sense. */ if (!guard.gc->get && guard.gc->direction_input) { gpiod_warn(desc, "%s: missing get() but have direction_input()\n", __func__); return -EIO; } /* * If we have a .direction_input() callback, things are simple, * just call it. Else we are some input-only chip so try to check the * direction (if .get_direction() is supported) else we silently * assume we are in input mode after this. */ if (guard.gc->direction_input) { ret = guard.gc->direction_input(guard.gc, gpio_chip_hwgpio(desc)); } else if (guard.gc->get_direction && (guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc)) != 1)) { gpiod_warn(desc, "%s: missing direction_input() operation and line is output\n", __func__); return -EIO; } if (ret == 0) { clear_bit(FLAG_IS_OUT, &desc->flags); ret = gpio_set_bias(desc); } trace_gpio_direction(desc_to_gpio(desc), 1, ret); return ret; } static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) { int val = !!value, ret = 0; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; /* * It's OK not to specify .direction_output() if the gpiochip is * output-only, but if there is then not even a .set() operation it * is pretty tricky to drive the output line. */ if (!guard.gc->set && !guard.gc->direction_output) { gpiod_warn(desc, "%s: missing set() and direction_output() operations\n", __func__); return -EIO; } if (guard.gc->direction_output) { ret = guard.gc->direction_output(guard.gc, gpio_chip_hwgpio(desc), val); } else { /* Check that we are in output mode if we can */ if (guard.gc->get_direction && guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc))) { gpiod_warn(desc, "%s: missing direction_output() operation\n", __func__); return -EIO; } /* * If we can't actively set the direction, we are some * output-only chip, so just drive the output as desired. */ guard.gc->set(guard.gc, gpio_chip_hwgpio(desc), val); } if (!ret) set_bit(FLAG_IS_OUT, &desc->flags); trace_gpio_value(desc_to_gpio(desc), 0, val); trace_gpio_direction(desc_to_gpio(desc), 0, ret); return ret; } /** * gpiod_direction_output_raw - set the GPIO direction to output * @desc: GPIO to set to output * @value: initial output value of the GPIO * * Set the direction of the passed GPIO to output, such as gpiod_set_value() can * be called safely on it. The initial value of the output must be specified * as raw value on the physical line without regard for the ACTIVE_LOW status. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_direction_output_raw(struct gpio_desc *desc, int value) { int ret; VALIDATE_DESC(desc); ret = gpiod_direction_output_raw_commit(desc, value); if (ret == 0) gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); return ret; } EXPORT_SYMBOL_GPL(gpiod_direction_output_raw); /** * gpiod_direction_output - set the GPIO direction to output * @desc: GPIO to set to output * @value: initial output value of the GPIO * * Set the direction of the passed GPIO to output, such as gpiod_set_value() can * be called safely on it. The initial value of the output must be specified * as the logical value of the GPIO, i.e. taking its ACTIVE_LOW status into * account. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_direction_output(struct gpio_desc *desc, int value) { int ret; VALIDATE_DESC(desc); ret = gpiod_direction_output_nonotify(desc, value); if (ret == 0) gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); return ret; } EXPORT_SYMBOL_GPL(gpiod_direction_output); int gpiod_direction_output_nonotify(struct gpio_desc *desc, int value) { unsigned long flags; int ret; flags = READ_ONCE(desc->flags); if (test_bit(FLAG_ACTIVE_LOW, &flags)) value = !value; else value = !!value; /* GPIOs used for enabled IRQs shall not be set as output */ if (test_bit(FLAG_USED_AS_IRQ, &flags) && test_bit(FLAG_IRQ_IS_ENABLED, &flags)) { gpiod_err(desc, "%s: tried to set a GPIO tied to an IRQ as output\n", __func__); return -EIO; } if (test_bit(FLAG_OPEN_DRAIN, &flags)) { /* First see if we can enable open drain in hardware */ ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_DRAIN); if (!ret) goto set_output_value; /* Emulate open drain by not actively driving the line high */ if (value) { ret = gpiod_direction_input_nonotify(desc); goto set_output_flag; } } else if (test_bit(FLAG_OPEN_SOURCE, &flags)) { ret = gpio_set_config(desc, PIN_CONFIG_DRIVE_OPEN_SOURCE); if (!ret) goto set_output_value; /* Emulate open source by not actively driving the line low */ if (!value) { ret = gpiod_direction_input_nonotify(desc); goto set_output_flag; } } else { gpio_set_config(desc, PIN_CONFIG_DRIVE_PUSH_PULL); } set_output_value: ret = gpio_set_bias(desc); if (ret) return ret; return gpiod_direction_output_raw_commit(desc, value); set_output_flag: /* * When emulating open-source or open-drain functionalities by not * actively driving the line (setting mode to input) we still need to * set the IS_OUT flag or otherwise we won't be able to set the line * value anymore. */ if (ret == 0) set_bit(FLAG_IS_OUT, &desc->flags); return ret; } /** * gpiod_enable_hw_timestamp_ns - Enable hardware timestamp in nanoseconds. * * @desc: GPIO to enable. * @flags: Flags related to GPIO edge. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_enable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags) { int ret = 0; VALIDATE_DESC(desc); CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; if (!guard.gc->en_hw_timestamp) { gpiod_warn(desc, "%s: hw ts not supported\n", __func__); return -ENOTSUPP; } ret = guard.gc->en_hw_timestamp(guard.gc, gpio_chip_hwgpio(desc), flags); if (ret) gpiod_warn(desc, "%s: hw ts request failed\n", __func__); return ret; } EXPORT_SYMBOL_GPL(gpiod_enable_hw_timestamp_ns); /** * gpiod_disable_hw_timestamp_ns - Disable hardware timestamp. * * @desc: GPIO to disable. * @flags: Flags related to GPIO edge, same value as used during enable call. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_disable_hw_timestamp_ns(struct gpio_desc *desc, unsigned long flags) { int ret = 0; VALIDATE_DESC(desc); CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; if (!guard.gc->dis_hw_timestamp) { gpiod_warn(desc, "%s: hw ts not supported\n", __func__); return -ENOTSUPP; } ret = guard.gc->dis_hw_timestamp(guard.gc, gpio_chip_hwgpio(desc), flags); if (ret) gpiod_warn(desc, "%s: hw ts release failed\n", __func__); return ret; } EXPORT_SYMBOL_GPL(gpiod_disable_hw_timestamp_ns); /** * gpiod_set_config - sets @config for a GPIO * @desc: descriptor of the GPIO for which to set the configuration * @config: Same packed config format as generic pinconf * * Returns: * 0 on success, %-ENOTSUPP if the controller doesn't support setting the * configuration. */ int gpiod_set_config(struct gpio_desc *desc, unsigned long config) { int ret; VALIDATE_DESC(desc); ret = gpio_do_set_config(desc, config); if (!ret) { /* These are the only options we notify the userspace about. */ switch (pinconf_to_config_param(config)) { case PIN_CONFIG_BIAS_DISABLE: case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_DRIVE_OPEN_DRAIN: case PIN_CONFIG_DRIVE_OPEN_SOURCE: case PIN_CONFIG_DRIVE_PUSH_PULL: case PIN_CONFIG_INPUT_DEBOUNCE: gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); break; default: break; } } return ret; } EXPORT_SYMBOL_GPL(gpiod_set_config); /** * gpiod_set_debounce - sets @debounce time for a GPIO * @desc: descriptor of the GPIO for which to set debounce time * @debounce: debounce time in microseconds * * Returns: * 0 on success, %-ENOTSUPP if the controller doesn't support setting the * debounce time. */ int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce) { unsigned long config; config = pinconf_to_config_packed(PIN_CONFIG_INPUT_DEBOUNCE, debounce); return gpiod_set_config(desc, config); } EXPORT_SYMBOL_GPL(gpiod_set_debounce); /** * gpiod_set_transitory - Lose or retain GPIO state on suspend or reset * @desc: descriptor of the GPIO for which to configure persistence * @transitory: True to lose state on suspend or reset, false for persistence * * Returns: * 0 on success, otherwise a negative error code. */ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) { VALIDATE_DESC(desc); /* * Handle FLAG_TRANSITORY first, enabling queries to gpiolib for * persistence state. */ assign_bit(FLAG_TRANSITORY, &desc->flags, transitory); /* If the driver supports it, set the persistence state now */ return gpio_set_config_with_argument_optional(desc, PIN_CONFIG_PERSIST_STATE, !transitory); } /** * gpiod_is_active_low - test whether a GPIO is active-low or not * @desc: the gpio descriptor to test * * Returns: * 1 if the GPIO is active-low, 0 otherwise. */ int gpiod_is_active_low(const struct gpio_desc *desc) { VALIDATE_DESC(desc); return test_bit(FLAG_ACTIVE_LOW, &desc->flags); } EXPORT_SYMBOL_GPL(gpiod_is_active_low); /** * gpiod_toggle_active_low - toggle whether a GPIO is active-low or not * @desc: the gpio descriptor to change */ void gpiod_toggle_active_low(struct gpio_desc *desc) { VALIDATE_DESC_VOID(desc); change_bit(FLAG_ACTIVE_LOW, &desc->flags); gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); } EXPORT_SYMBOL_GPL(gpiod_toggle_active_low); static int gpio_chip_get_value(struct gpio_chip *gc, const struct gpio_desc *desc) { return gc->get ? gc->get(gc, gpio_chip_hwgpio(desc)) : -EIO; } /* I/O calls are only valid after configuration completed; the relevant * "is this a valid GPIO" error checks should already have been done. * * "Get" operations are often inlinable as reading a pin value register, * and masking the relevant bit in that register. * * When "set" operations are inlinable, they involve writing that mask to * one register to set a low value, or a different register to set it high. * Otherwise locking is needed, so there may be little value to inlining. * *------------------------------------------------------------------------ * * IMPORTANT!!! The hot paths -- get/set value -- assume that callers * have requested the GPIO. That can include implicit requesting by * a direction setting call. Marking a gpio as requested locks its chip * in memory, guaranteeing that these table lookups need no more locking * and that gpiochip_remove() will fail. * * REVISIT when debugging, consider adding some instrumentation to ensure * that the GPIO was actually requested. */ static int gpiod_get_raw_value_commit(const struct gpio_desc *desc) { struct gpio_device *gdev; struct gpio_chip *gc; int value; /* FIXME Unable to use gpio_chip_guard due to const desc. */ gdev = desc->gdev; guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); if (!gc) return -ENODEV; value = gpio_chip_get_value(gc, desc); value = value < 0 ? value : !!value; trace_gpio_value(desc_to_gpio(desc), 1, value); return value; } static int gpio_chip_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { if (gc->get_multiple) return gc->get_multiple(gc, mask, bits); if (gc->get) { int i, value; for_each_set_bit(i, mask, gc->ngpio) { value = gc->get(gc, i); if (value < 0) return value; __assign_bit(i, bits, value); } return 0; } return -EIO; } /* The 'other' chip must be protected with its GPIO device's SRCU. */ static bool gpio_device_chip_cmp(struct gpio_device *gdev, struct gpio_chip *gc) { guard(srcu)(&gdev->srcu); return gc == srcu_dereference(gdev->chip, &gdev->srcu); } int gpiod_get_array_value_complex(bool raw, bool can_sleep, unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { int ret, i = 0; /* * Validate array_info against desc_array and its size. * It should immediately follow desc_array if both * have been obtained from the same gpiod_get_array() call. */ if (array_info && array_info->desc == desc_array && array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) WARN_ON(array_info->chip->can_sleep); ret = gpio_chip_get_multiple(array_info->chip, array_info->get_mask, value_bitmap); if (ret) return ret; if (!raw && !bitmap_empty(array_info->invert_mask, array_size)) bitmap_xor(value_bitmap, value_bitmap, array_info->invert_mask, array_size); i = find_first_zero_bit(array_info->get_mask, array_size); if (i == array_size) return 0; } else { array_info = NULL; } while (i < array_size) { DECLARE_BITMAP(fastpath_mask, FASTPATH_NGPIO); DECLARE_BITMAP(fastpath_bits, FASTPATH_NGPIO); unsigned long *mask, *bits; int first, j; CLASS(gpio_chip_guard, guard)(desc_array[i]); if (!guard.gc) return -ENODEV; if (likely(guard.gc->ngpio <= FASTPATH_NGPIO)) { mask = fastpath_mask; bits = fastpath_bits; } else { gfp_t flags = can_sleep ? GFP_KERNEL : GFP_ATOMIC; mask = bitmap_alloc(guard.gc->ngpio, flags); if (!mask) return -ENOMEM; bits = bitmap_alloc(guard.gc->ngpio, flags); if (!bits) { bitmap_free(mask); return -ENOMEM; } } bitmap_zero(mask, guard.gc->ngpio); if (!can_sleep) WARN_ON(guard.gc->can_sleep); /* collect all inputs belonging to the same chip */ first = i; do { const struct gpio_desc *desc = desc_array[i]; int hwgpio = gpio_chip_hwgpio(desc); __set_bit(hwgpio, mask); i++; if (array_info) i = find_next_zero_bit(array_info->get_mask, array_size, i); } while ((i < array_size) && gpio_device_chip_cmp(desc_array[i]->gdev, guard.gc)); ret = gpio_chip_get_multiple(guard.gc, mask, bits); if (ret) { if (mask != fastpath_mask) bitmap_free(mask); if (bits != fastpath_bits) bitmap_free(bits); return ret; } for (j = first; j < i; ) { const struct gpio_desc *desc = desc_array[j]; int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(hwgpio, bits); if (!raw && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; __assign_bit(j, value_bitmap, value); trace_gpio_value(desc_to_gpio(desc), 1, value); j++; if (array_info) j = find_next_zero_bit(array_info->get_mask, i, j); } if (mask != fastpath_mask) bitmap_free(mask); if (bits != fastpath_bits) bitmap_free(bits); } return 0; } /** * gpiod_get_raw_value() - return a gpio's raw value * @desc: gpio whose value will be returned * * Returns: * The GPIO's raw value, i.e. the value of the physical line disregarding * its ACTIVE_LOW status, or negative errno on failure. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ int gpiod_get_raw_value(const struct gpio_desc *desc) { VALIDATE_DESC(desc); /* Should be using gpiod_get_raw_value_cansleep() */ WARN_ON(desc->gdev->can_sleep); return gpiod_get_raw_value_commit(desc); } EXPORT_SYMBOL_GPL(gpiod_get_raw_value); /** * gpiod_get_value() - return a gpio's value * @desc: gpio whose value will be returned * * Returns: * The GPIO's logical value, i.e. taking the ACTIVE_LOW status into * account, or negative errno on failure. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ int gpiod_get_value(const struct gpio_desc *desc) { int value; VALIDATE_DESC(desc); /* Should be using gpiod_get_value_cansleep() */ WARN_ON(desc->gdev->can_sleep); value = gpiod_get_raw_value_commit(desc); if (value < 0) return value; if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; } EXPORT_SYMBOL_GPL(gpiod_get_value); /** * gpiod_get_raw_array_value() - read raw values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. * * This function can be called from contexts where we cannot sleep, * and it will complain if the GPIO chip functions potentially sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_get_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(true, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value); /** * gpiod_get_array_value() - read values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. * * This function can be called from contexts where we cannot sleep, * and it will complain if the GPIO chip functions potentially sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_get_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(false, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_array_value); /* * gpio_set_open_drain_value_commit() - Set the open drain gpio's value. * @desc: gpio descriptor whose state need to be set. * @value: Non-zero for setting it HIGH otherwise it will set to LOW. */ static void gpio_set_open_drain_value_commit(struct gpio_desc *desc, bool value) { int ret = 0, offset = gpio_chip_hwgpio(desc); CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return; if (value) { ret = guard.gc->direction_input(guard.gc, offset); } else { ret = guard.gc->direction_output(guard.gc, offset, 0); if (!ret) set_bit(FLAG_IS_OUT, &desc->flags); } trace_gpio_direction(desc_to_gpio(desc), value, ret); if (ret < 0) gpiod_err(desc, "%s: Error in set_value for open drain err %d\n", __func__, ret); } /* * _gpio_set_open_source_value() - Set the open source gpio's value. * @desc: gpio descriptor whose state need to be set. * @value: Non-zero for setting it HIGH otherwise it will set to LOW. */ static void gpio_set_open_source_value_commit(struct gpio_desc *desc, bool value) { int ret = 0, offset = gpio_chip_hwgpio(desc); CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return; if (value) { ret = guard.gc->direction_output(guard.gc, offset, 1); if (!ret) set_bit(FLAG_IS_OUT, &desc->flags); } else { ret = guard.gc->direction_input(guard.gc, offset); } trace_gpio_direction(desc_to_gpio(desc), !value, ret); if (ret < 0) gpiod_err(desc, "%s: Error in set_value for open source err %d\n", __func__, ret); } static void gpiod_set_raw_value_commit(struct gpio_desc *desc, bool value) { CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return; trace_gpio_value(desc_to_gpio(desc), 0, value); guard.gc->set(guard.gc, gpio_chip_hwgpio(desc), value); } /* * set multiple outputs on the same chip; * use the chip's set_multiple function if available; * otherwise set the outputs sequentially; * @chip: the GPIO chip we operate on * @mask: bit mask array; one bit per output; BITS_PER_LONG bits per word * defines which outputs are to be changed * @bits: bit value array; one bit per output; BITS_PER_LONG bits per word * defines the values the outputs specified by mask are to be set to */ static void gpio_chip_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { if (gc->set_multiple) { gc->set_multiple(gc, mask, bits); } else { unsigned int i; /* set outputs if the corresponding mask bit is set */ for_each_set_bit(i, mask, gc->ngpio) gc->set(gc, i, test_bit(i, bits)); } } int gpiod_set_array_value_complex(bool raw, bool can_sleep, unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { int i = 0; /* * Validate array_info against desc_array and its size. * It should immediately follow desc_array if both * have been obtained from the same gpiod_get_array() call. */ if (array_info && array_info->desc == desc_array && array_size <= array_info->size && (void *)array_info == desc_array + array_info->size) { if (!can_sleep) WARN_ON(array_info->chip->can_sleep); if (!raw && !bitmap_empty(array_info->invert_mask, array_size)) bitmap_xor(value_bitmap, value_bitmap, array_info->invert_mask, array_size); gpio_chip_set_multiple(array_info->chip, array_info->set_mask, value_bitmap); i = find_first_zero_bit(array_info->set_mask, array_size); if (i == array_size) return 0; } else { array_info = NULL; } while (i < array_size) { DECLARE_BITMAP(fastpath_mask, FASTPATH_NGPIO); DECLARE_BITMAP(fastpath_bits, FASTPATH_NGPIO); unsigned long *mask, *bits; int count = 0; CLASS(gpio_chip_guard, guard)(desc_array[i]); if (!guard.gc) return -ENODEV; if (likely(guard.gc->ngpio <= FASTPATH_NGPIO)) { mask = fastpath_mask; bits = fastpath_bits; } else { gfp_t flags = can_sleep ? GFP_KERNEL : GFP_ATOMIC; mask = bitmap_alloc(guard.gc->ngpio, flags); if (!mask) return -ENOMEM; bits = bitmap_alloc(guard.gc->ngpio, flags); if (!bits) { bitmap_free(mask); return -ENOMEM; } } bitmap_zero(mask, guard.gc->ngpio); if (!can_sleep) WARN_ON(guard.gc->can_sleep); do { struct gpio_desc *desc = desc_array[i]; int hwgpio = gpio_chip_hwgpio(desc); int value = test_bit(i, value_bitmap); /* * Pins applicable for fast input but not for * fast output processing may have been already * inverted inside the fast path, skip them. */ if (!raw && !(array_info && test_bit(i, array_info->invert_mask)) && test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; trace_gpio_value(desc_to_gpio(desc), 0, value); /* * collect all normal outputs belonging to the same chip * open drain and open source outputs are set individually */ if (test_bit(FLAG_OPEN_DRAIN, &desc->flags) && !raw) { gpio_set_open_drain_value_commit(desc, value); } else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags) && !raw) { gpio_set_open_source_value_commit(desc, value); } else { __set_bit(hwgpio, mask); __assign_bit(hwgpio, bits, value); count++; } i++; if (array_info) i = find_next_zero_bit(array_info->set_mask, array_size, i); } while ((i < array_size) && gpio_device_chip_cmp(desc_array[i]->gdev, guard.gc)); /* push collected bits to outputs */ if (count != 0) gpio_chip_set_multiple(guard.gc, mask, bits); if (mask != fastpath_mask) bitmap_free(mask); if (bits != fastpath_bits) bitmap_free(bits); } return 0; } /** * gpiod_set_raw_value() - assign a gpio's raw value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the raw value of the GPIO, i.e. the value of its physical line without * regard for its ACTIVE_LOW status. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ void gpiod_set_raw_value(struct gpio_desc *desc, int value) { VALIDATE_DESC_VOID(desc); /* Should be using gpiod_set_raw_value_cansleep() */ WARN_ON(desc->gdev->can_sleep); gpiod_set_raw_value_commit(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_raw_value); /** * gpiod_set_value_nocheck() - set a GPIO line value without checking * @desc: the descriptor to set the value on * @value: value to set * * This sets the value of a GPIO line backing a descriptor, applying * different semantic quirks like active low and open drain/source * handling. */ static void gpiod_set_value_nocheck(struct gpio_desc *desc, int value) { if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) gpio_set_open_drain_value_commit(desc, value); else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) gpio_set_open_source_value_commit(desc, value); else gpiod_set_raw_value_commit(desc, value); } /** * gpiod_set_value() - assign a gpio's value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the logical value of the GPIO, i.e. taking its ACTIVE_LOW, * OPEN_DRAIN and OPEN_SOURCE flags into account. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. */ void gpiod_set_value(struct gpio_desc *desc, int value) { VALIDATE_DESC_VOID(desc); /* Should be using gpiod_set_value_cansleep() */ WARN_ON(desc->gdev->can_sleep); gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value); /** * gpiod_set_raw_array_value() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_set_raw_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(true, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value); /** * gpiod_set_array_value() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. * * This function can be called from contexts where we cannot sleep, and will * complain if the GPIO chip functions potentially sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_set_array_value(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(false, false, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_array_value); /** * gpiod_cansleep() - report whether gpio value access may sleep * @desc: gpio to check * * Returns: * 0 for non-sleepable, 1 for sleepable, or an error code in case of error. */ int gpiod_cansleep(const struct gpio_desc *desc) { VALIDATE_DESC(desc); return desc->gdev->can_sleep; } EXPORT_SYMBOL_GPL(gpiod_cansleep); /** * gpiod_set_consumer_name() - set the consumer name for the descriptor * @desc: gpio to set the consumer name on * @name: the new consumer name * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name) { int ret; VALIDATE_DESC(desc); ret = desc_set_label(desc, name); if (ret == 0) gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_CONFIG); return ret; } EXPORT_SYMBOL_GPL(gpiod_set_consumer_name); /** * gpiod_to_irq() - return the IRQ corresponding to a GPIO * @desc: gpio whose IRQ will be returned (already requested) * * Returns: * The IRQ corresponding to the passed GPIO, or an error code in case of error. */ int gpiod_to_irq(const struct gpio_desc *desc) { struct gpio_device *gdev; struct gpio_chip *gc; int offset; /* * Cannot VALIDATE_DESC() here as gpiod_to_irq() consumer semantics * requires this function to not return zero on an invalid descriptor * but rather a negative error number. */ if (IS_ERR_OR_NULL(desc)) return -EINVAL; gdev = desc->gdev; /* FIXME Cannot use gpio_chip_guard due to const desc. */ guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); if (!gc) return -ENODEV; offset = gpio_chip_hwgpio(desc); if (gc->to_irq) { int retirq = gc->to_irq(gc, offset); /* Zero means NO_IRQ */ if (!retirq) return -ENXIO; return retirq; } #ifdef CONFIG_GPIOLIB_IRQCHIP if (gc->irq.chip) { /* * Avoid race condition with other code, which tries to lookup * an IRQ before the irqchip has been properly registered, * i.e. while gpiochip is still being brought up. */ return -EPROBE_DEFER; } #endif return -ENXIO; } EXPORT_SYMBOL_GPL(gpiod_to_irq); /** * gpiochip_lock_as_irq() - lock a GPIO to be used as IRQ * @gc: the chip the GPIO to lock belongs to * @offset: the offset of the GPIO to lock as IRQ * * This is used directly by GPIO drivers that want to lock down * a certain GPIO line to be used for IRQs. * * Returns: * 0 on success, or negative errno on failure. */ int gpiochip_lock_as_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; desc = gpiochip_get_desc(gc, offset); if (IS_ERR(desc)) return PTR_ERR(desc); /* * If it's fast: flush the direction setting if something changed * behind our back */ if (!gc->can_sleep && gc->get_direction) { int dir = gpiod_get_direction(desc); if (dir < 0) { chip_err(gc, "%s: cannot get GPIO direction\n", __func__); return dir; } } /* To be valid for IRQ the line needs to be input or open drain */ if (test_bit(FLAG_IS_OUT, &desc->flags) && !test_bit(FLAG_OPEN_DRAIN, &desc->flags)) { chip_err(gc, "%s: tried to flag a GPIO set as output for IRQ\n", __func__); return -EIO; } set_bit(FLAG_USED_AS_IRQ, &desc->flags); set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); return 0; } EXPORT_SYMBOL_GPL(gpiochip_lock_as_irq); /** * gpiochip_unlock_as_irq() - unlock a GPIO used as IRQ * @gc: the chip the GPIO to lock belongs to * @offset: the offset of the GPIO to lock as IRQ * * This is used directly by GPIO drivers that want to indicate * that a certain GPIO is no longer used exclusively for IRQ. */ void gpiochip_unlock_as_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; desc = gpiochip_get_desc(gc, offset); if (IS_ERR(desc)) return; clear_bit(FLAG_USED_AS_IRQ, &desc->flags); clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_unlock_as_irq); void gpiochip_disable_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) clear_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); } EXPORT_SYMBOL_GPL(gpiochip_disable_irq); void gpiochip_enable_irq(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc = gpiochip_get_desc(gc, offset); if (!IS_ERR(desc) && !WARN_ON(!test_bit(FLAG_USED_AS_IRQ, &desc->flags))) { /* * We must not be output when using IRQ UNLESS we are * open drain. */ WARN_ON(test_bit(FLAG_IS_OUT, &desc->flags) && !test_bit(FLAG_OPEN_DRAIN, &desc->flags)); set_bit(FLAG_IRQ_IS_ENABLED, &desc->flags); } } EXPORT_SYMBOL_GPL(gpiochip_enable_irq); bool gpiochip_line_is_irq(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return test_bit(FLAG_USED_AS_IRQ, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_irq); int gpiochip_reqres_irq(struct gpio_chip *gc, unsigned int offset) { int ret; if (!try_module_get(gc->gpiodev->owner)) return -ENODEV; ret = gpiochip_lock_as_irq(gc, offset); if (ret) { chip_err(gc, "unable to lock HW IRQ %u for IRQ\n", offset); module_put(gc->gpiodev->owner); return ret; } return 0; } EXPORT_SYMBOL_GPL(gpiochip_reqres_irq); void gpiochip_relres_irq(struct gpio_chip *gc, unsigned int offset) { gpiochip_unlock_as_irq(gc, offset); module_put(gc->gpiodev->owner); } EXPORT_SYMBOL_GPL(gpiochip_relres_irq); bool gpiochip_line_is_open_drain(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return test_bit(FLAG_OPEN_DRAIN, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_drain); bool gpiochip_line_is_open_source(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return test_bit(FLAG_OPEN_SOURCE, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_open_source); bool gpiochip_line_is_persistent(struct gpio_chip *gc, unsigned int offset) { if (offset >= gc->ngpio) return false; return !test_bit(FLAG_TRANSITORY, &gc->gpiodev->descs[offset].flags); } EXPORT_SYMBOL_GPL(gpiochip_line_is_persistent); /** * gpiod_get_raw_value_cansleep() - return a gpio's raw value * @desc: gpio whose value will be returned * * Returns: * The GPIO's raw value, i.e. the value of the physical line disregarding * its ACTIVE_LOW status, or negative errno on failure. * * This function is to be called from contexts that can sleep. */ int gpiod_get_raw_value_cansleep(const struct gpio_desc *desc) { might_sleep(); VALIDATE_DESC(desc); return gpiod_get_raw_value_commit(desc); } EXPORT_SYMBOL_GPL(gpiod_get_raw_value_cansleep); /** * gpiod_get_value_cansleep() - return a gpio's value * @desc: gpio whose value will be returned * * Returns: * The GPIO's logical value, i.e. taking the ACTIVE_LOW status into * account, or negative errno on failure. * * This function is to be called from contexts that can sleep. */ int gpiod_get_value_cansleep(const struct gpio_desc *desc) { int value; might_sleep(); VALIDATE_DESC(desc); value = gpiod_get_raw_value_commit(desc); if (value < 0) return value; if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) value = !value; return value; } EXPORT_SYMBOL_GPL(gpiod_get_value_cansleep); /** * gpiod_get_raw_array_value_cansleep() - read raw values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. * * This function is to be called from contexts that can sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_get_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(true, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_raw_array_value_cansleep); /** * gpiod_get_array_value_cansleep() - read values from an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be read * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap to store the read values * * Read the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. * * This function is to be called from contexts that can sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_get_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_get_array_value_complex(false, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_get_array_value_cansleep); /** * gpiod_set_raw_value_cansleep() - assign a gpio's raw value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the raw value of the GPIO, i.e. the value of its physical line without * regard for its ACTIVE_LOW status. * * This function is to be called from contexts that can sleep. */ void gpiod_set_raw_value_cansleep(struct gpio_desc *desc, int value) { might_sleep(); VALIDATE_DESC_VOID(desc); gpiod_set_raw_value_commit(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_raw_value_cansleep); /** * gpiod_set_value_cansleep() - assign a gpio's value * @desc: gpio whose value will be assigned * @value: value to assign * * Set the logical value of the GPIO, i.e. taking its ACTIVE_LOW status into * account * * This function is to be called from contexts that can sleep. */ void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { might_sleep(); VALIDATE_DESC_VOID(desc); gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep); /** * gpiod_set_raw_array_value_cansleep() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the raw values of the GPIOs, i.e. the values of the physical lines * without regard for their ACTIVE_LOW status. * * This function is to be called from contexts that can sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_set_raw_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(true, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value_cansleep); /** * gpiod_add_lookup_tables() - register GPIO device consumers * @tables: list of tables of consumers to register * @n: number of tables in the list */ void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) { unsigned int i; mutex_lock(&gpio_lookup_lock); for (i = 0; i < n; i++) list_add_tail(&tables[i]->list, &gpio_lookup_list); mutex_unlock(&gpio_lookup_lock); } /** * gpiod_set_array_value_cansleep() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor array / value bitmap * @desc_array: array of GPIO descriptors whose values will be assigned * @array_info: information on applicability of fast bitmap processing path * @value_bitmap: bitmap of values to assign * * Set the logical values of the GPIOs, i.e. taking their ACTIVE_LOW status * into account. * * This function is to be called from contexts that can sleep. * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_set_array_value_cansleep(unsigned int array_size, struct gpio_desc **desc_array, struct gpio_array *array_info, unsigned long *value_bitmap) { might_sleep(); if (!desc_array) return -EINVAL; return gpiod_set_array_value_complex(false, true, array_size, desc_array, array_info, value_bitmap); } EXPORT_SYMBOL_GPL(gpiod_set_array_value_cansleep); void gpiod_line_state_notify(struct gpio_desc *desc, unsigned long action) { atomic_notifier_call_chain(&desc->gdev->line_state_notifier, action, desc); } /** * gpiod_add_lookup_table() - register GPIO device consumers * @table: table of consumers to register */ void gpiod_add_lookup_table(struct gpiod_lookup_table *table) { gpiod_add_lookup_tables(&table, 1); } EXPORT_SYMBOL_GPL(gpiod_add_lookup_table); /** * gpiod_remove_lookup_table() - unregister GPIO device consumers * @table: table of consumers to unregister */ void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) { /* Nothing to remove */ if (!table) return; mutex_lock(&gpio_lookup_lock); list_del(&table->list); mutex_unlock(&gpio_lookup_lock); } EXPORT_SYMBOL_GPL(gpiod_remove_lookup_table); /** * gpiod_add_hogs() - register a set of GPIO hogs from machine code * @hogs: table of gpio hog entries with a zeroed sentinel at the end */ void gpiod_add_hogs(struct gpiod_hog *hogs) { struct gpiod_hog *hog; mutex_lock(&gpio_machine_hogs_mutex); for (hog = &hogs[0]; hog->chip_label; hog++) { list_add_tail(&hog->list, &gpio_machine_hogs); /* * The chip may have been registered earlier, so check if it * exists and, if so, try to hog the line now. */ struct gpio_device *gdev __free(gpio_device_put) = gpio_device_find_by_label(hog->chip_label); if (gdev) gpiochip_machine_hog(gpio_device_get_chip(gdev), hog); } mutex_unlock(&gpio_machine_hogs_mutex); } EXPORT_SYMBOL_GPL(gpiod_add_hogs); void gpiod_remove_hogs(struct gpiod_hog *hogs) { struct gpiod_hog *hog; mutex_lock(&gpio_machine_hogs_mutex); for (hog = &hogs[0]; hog->chip_label; hog++) list_del(&hog->list); mutex_unlock(&gpio_machine_hogs_mutex); } EXPORT_SYMBOL_GPL(gpiod_remove_hogs); static struct gpiod_lookup_table *gpiod_find_lookup_table(struct device *dev) { const char *dev_id = dev ? dev_name(dev) : NULL; struct gpiod_lookup_table *table; list_for_each_entry(table, &gpio_lookup_list, list) { if (table->dev_id && dev_id) { /* * Valid strings on both ends, must be identical to have * a match */ if (!strcmp(table->dev_id, dev_id)) return table; } else { /* * One of the pointers is NULL, so both must be to have * a match */ if (dev_id == table->dev_id) return table; } } return NULL; } static struct gpio_desc *gpiod_find(struct device *dev, const char *con_id, unsigned int idx, unsigned long *flags) { struct gpio_desc *desc = ERR_PTR(-ENOENT); struct gpiod_lookup_table *table; struct gpiod_lookup *p; struct gpio_chip *gc; guard(mutex)(&gpio_lookup_lock); table = gpiod_find_lookup_table(dev); if (!table) return desc; for (p = &table->table[0]; p->key; p++) { /* idx must always match exactly */ if (p->idx != idx) continue; /* If the lookup entry has a con_id, require exact match */ if (p->con_id && (!con_id || strcmp(p->con_id, con_id))) continue; if (p->chip_hwnum == U16_MAX) { desc = gpio_name_to_desc(p->key); if (desc) { *flags = p->flags; return desc; } dev_warn(dev, "cannot find GPIO line %s, deferring\n", p->key); return ERR_PTR(-EPROBE_DEFER); } struct gpio_device *gdev __free(gpio_device_put) = gpio_device_find_by_label(p->key); if (!gdev) { /* * As the lookup table indicates a chip with * p->key should exist, assume it may * still appear later and let the interested * consumer be probed again or let the Deferred * Probe infrastructure handle the error. */ dev_warn(dev, "cannot find GPIO chip %s, deferring\n", p->key); return ERR_PTR(-EPROBE_DEFER); } gc = gpio_device_get_chip(gdev); if (gc->ngpio <= p->chip_hwnum) { dev_err(dev, "requested GPIO %u (%u) is out of range [0..%u] for chip %s\n", idx, p->chip_hwnum, gc->ngpio - 1, gc->label); return ERR_PTR(-EINVAL); } desc = gpio_device_get_desc(gdev, p->chip_hwnum); *flags = p->flags; return desc; } return desc; } static int platform_gpio_count(struct device *dev, const char *con_id) { struct gpiod_lookup_table *table; struct gpiod_lookup *p; unsigned int count = 0; scoped_guard(mutex, &gpio_lookup_lock) { table = gpiod_find_lookup_table(dev); if (!table) return -ENOENT; for (p = &table->table[0]; p->key; p++) { if ((con_id && p->con_id && !strcmp(con_id, p->con_id)) || (!con_id && !p->con_id)) count++; } } if (!count) return -ENOENT; return count; } static struct gpio_desc *gpiod_find_by_fwnode(struct fwnode_handle *fwnode, struct device *consumer, const char *con_id, unsigned int idx, enum gpiod_flags *flags, unsigned long *lookupflags) { const char *name = function_name_or_default(con_id); struct gpio_desc *desc = ERR_PTR(-ENOENT); if (is_of_node(fwnode)) { dev_dbg(consumer, "using DT '%pfw' for '%s' GPIO lookup\n", fwnode, name); desc = of_find_gpio(to_of_node(fwnode), con_id, idx, lookupflags); } else if (is_acpi_node(fwnode)) { dev_dbg(consumer, "using ACPI '%pfw' for '%s' GPIO lookup\n", fwnode, name); desc = acpi_find_gpio(fwnode, con_id, idx, flags, lookupflags); } else if (is_software_node(fwnode)) { dev_dbg(consumer, "using swnode '%pfw' for '%s' GPIO lookup\n", fwnode, name); desc = swnode_find_gpio(fwnode, con_id, idx, lookupflags); } return desc; } struct gpio_desc *gpiod_find_and_request(struct device *consumer, struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags flags, const char *label, bool platform_lookup_allowed) { unsigned long lookupflags = GPIO_LOOKUP_FLAGS_DEFAULT; const char *name = function_name_or_default(con_id); /* * scoped_guard() is implemented as a for loop, meaning static * analyzers will complain about these two not being initialized. */ struct gpio_desc *desc = NULL; int ret = 0; scoped_guard(srcu, &gpio_devices_srcu) { desc = gpiod_find_by_fwnode(fwnode, consumer, con_id, idx, &flags, &lookupflags); if (gpiod_not_found(desc) && platform_lookup_allowed) { /* * Either we are not using DT or ACPI, or their lookup * did not return a result. In that case, use platform * lookup as a fallback. */ dev_dbg(consumer, "using lookup tables for GPIO lookup\n"); desc = gpiod_find(consumer, con_id, idx, &lookupflags); } if (IS_ERR(desc)) { dev_dbg(consumer, "No GPIO consumer %s found\n", name); return desc; } /* * If a connection label was passed use that, else attempt to use * the device name as label */ ret = gpiod_request(desc, label); } if (ret) { if (!(ret == -EBUSY && flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE)) return ERR_PTR(ret); /* * This happens when there are several consumers for * the same GPIO line: we just return here without * further initialization. It is a bit of a hack. * This is necessary to support fixed regulators. * * FIXME: Make this more sane and safe. */ dev_info(consumer, "nonexclusive access to GPIO for %s\n", name); return desc; } ret = gpiod_configure_flags(desc, con_id, lookupflags, flags); if (ret < 0) { gpiod_put(desc); dev_err(consumer, "setup of GPIO %s failed: %d\n", name, ret); return ERR_PTR(ret); } gpiod_line_state_notify(desc, GPIO_V2_LINE_CHANGED_REQUESTED); return desc; } /** * fwnode_gpiod_get_index - obtain a GPIO from firmware node * @fwnode: handle of the firmware node * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain for the consumer * @flags: GPIO initialization flags * @label: label to attach to the requested GPIO * * This function can be used for drivers that get their configuration * from opaque firmware. * * The function properly finds the corresponding GPIO using whatever is the * underlying firmware interface and then makes sure that the GPIO * descriptor is requested before it is returned to the caller. * * Returns: * On successful request the GPIO pin is configured in accordance with * provided @flags. * * In case of error an ERR_PTR() is returned. */ struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, const char *label) { return gpiod_find_and_request(NULL, fwnode, con_id, index, flags, label, false); } EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index); /** * gpiod_count - return the number of GPIOs associated with a device / function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * * Returns: * The number of GPIOs associated with a device / function or -ENOENT if no * GPIO has been assigned to the requested function. */ int gpiod_count(struct device *dev, const char *con_id) { const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; int count = -ENOENT; if (is_of_node(fwnode)) count = of_gpio_count(fwnode, con_id); else if (is_acpi_node(fwnode)) count = acpi_gpio_count(fwnode, con_id); else if (is_software_node(fwnode)) count = swnode_gpio_count(fwnode, con_id); if (count < 0) count = platform_gpio_count(dev, con_id); return count; } EXPORT_SYMBOL_GPL(gpiod_count); /** * gpiod_get - obtain a GPIO for a given GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, -ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check gpiod_get(struct device *dev, const char *con_id, enum gpiod_flags flags) { return gpiod_get_index(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(gpiod_get); /** * gpiod_get_optional - obtain an optional GPIO for a given GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * This is equivalent to gpiod_get(), except that when no GPIO was assigned to * the requested function it will return NULL. This is convenient for drivers * that need to handle optional GPIOs. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, NULL if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { return gpiod_get_index_optional(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(gpiod_get_optional); /** * gpiod_configure_flags - helper function to configure a given GPIO * @desc: gpio whose value will be assigned * @con_id: function within the GPIO consumer * @lflags: bitmask of gpio_lookup_flags GPIO_* values - returned from * of_find_gpio() or of_get_gpio_hog() * @dflags: gpiod_flags - optional GPIO initialization flags * * Returns: * 0 on success, -ENOENT if no GPIO has been assigned to the * requested function and/or index, or another IS_ERR() code if an error * occurred while trying to acquire the GPIO. */ int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id, unsigned long lflags, enum gpiod_flags dflags) { const char *name = function_name_or_default(con_id); int ret; if (lflags & GPIO_ACTIVE_LOW) set_bit(FLAG_ACTIVE_LOW, &desc->flags); if (lflags & GPIO_OPEN_DRAIN) set_bit(FLAG_OPEN_DRAIN, &desc->flags); else if (dflags & GPIOD_FLAGS_BIT_OPEN_DRAIN) { /* * This enforces open drain mode from the consumer side. * This is necessary for some busses like I2C, but the lookup * should *REALLY* have specified them as open drain in the * first place, so print a little warning here. */ set_bit(FLAG_OPEN_DRAIN, &desc->flags); gpiod_warn(desc, "enforced open drain please flag it properly in DT/ACPI DSDT/board file\n"); } if (lflags & GPIO_OPEN_SOURCE) set_bit(FLAG_OPEN_SOURCE, &desc->flags); if (((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DOWN)) || ((lflags & GPIO_PULL_UP) && (lflags & GPIO_PULL_DISABLE)) || ((lflags & GPIO_PULL_DOWN) && (lflags & GPIO_PULL_DISABLE))) { gpiod_err(desc, "multiple pull-up, pull-down or pull-disable enabled, invalid configuration\n"); return -EINVAL; } if (lflags & GPIO_PULL_UP) set_bit(FLAG_PULL_UP, &desc->flags); else if (lflags & GPIO_PULL_DOWN) set_bit(FLAG_PULL_DOWN, &desc->flags); else if (lflags & GPIO_PULL_DISABLE) set_bit(FLAG_BIAS_DISABLE, &desc->flags); ret = gpiod_set_transitory(desc, (lflags & GPIO_TRANSITORY)); if (ret < 0) return ret; /* No particular flag request, return here... */ if (!(dflags & GPIOD_FLAGS_BIT_DIR_SET)) { gpiod_dbg(desc, "no flags found for GPIO %s\n", name); return 0; } /* Process flags */ if (dflags & GPIOD_FLAGS_BIT_DIR_OUT) ret = gpiod_direction_output_nonotify(desc, !!(dflags & GPIOD_FLAGS_BIT_DIR_VAL)); else ret = gpiod_direction_input_nonotify(desc); return ret; } /** * gpiod_get_index - obtain a GPIO from a multi-index GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @idx: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * This variant of gpiod_get() allows to access GPIOs other than the first * defined one for functions that define several GPIOs. * * Returns: * A valid GPIO descriptor, -ENOENT if no GPIO has been assigned to the * requested function and/or index, or another IS_ERR() code if an error * occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check gpiod_get_index(struct device *dev, const char *con_id, unsigned int idx, enum gpiod_flags flags) { struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; const char *devname = dev ? dev_name(dev) : "?"; const char *label = con_id ?: devname; return gpiod_find_and_request(dev, fwnode, con_id, idx, flags, label, true); } EXPORT_SYMBOL_GPL(gpiod_get_index); /** * gpiod_get_index_optional - obtain an optional GPIO from a multi-index GPIO * function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * This is equivalent to gpiod_get_index(), except that when no GPIO with the * specified index was assigned to the requested function it will return NULL. * This is convenient for drivers that need to handle optional GPIOs. * * Returns: * A valid GPIO descriptor, NULL if no GPIO has been assigned to the * requested function and/or index, or another IS_ERR() code if an error * occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { struct gpio_desc *desc; desc = gpiod_get_index(dev, con_id, index, flags); if (gpiod_not_found(desc)) return NULL; return desc; } EXPORT_SYMBOL_GPL(gpiod_get_index_optional); /** * gpiod_hog - Hog the specified GPIO desc given the provided flags * @desc: gpio whose value will be assigned * @name: gpio line name * @lflags: bitmask of gpio_lookup_flags GPIO_* values - returned from * of_find_gpio() or of_get_gpio_hog() * @dflags: gpiod_flags - optional GPIO initialization flags * * Returns: * 0 on success, or negative errno on failure. */ int gpiod_hog(struct gpio_desc *desc, const char *name, unsigned long lflags, enum gpiod_flags dflags) { struct gpio_device *gdev = desc->gdev; struct gpio_desc *local_desc; int hwnum; int ret; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) return -ENODEV; if (test_and_set_bit(FLAG_IS_HOGGED, &desc->flags)) return 0; hwnum = gpio_chip_hwgpio(desc); local_desc = gpiochip_request_own_desc(guard.gc, hwnum, name, lflags, dflags); if (IS_ERR(local_desc)) { clear_bit(FLAG_IS_HOGGED, &desc->flags); ret = PTR_ERR(local_desc); pr_err("requesting hog GPIO %s (chip %s, offset %d) failed, %d\n", name, gdev->label, hwnum, ret); return ret; } gpiod_dbg(desc, "hogged as %s%s\n", (dflags & GPIOD_FLAGS_BIT_DIR_OUT) ? "output" : "input", (dflags & GPIOD_FLAGS_BIT_DIR_OUT) ? (dflags & GPIOD_FLAGS_BIT_DIR_VAL) ? "/high" : "/low" : ""); return 0; } /** * gpiochip_free_hogs - Scan gpio-controller chip and release GPIO hog * @gc: gpio chip to act on */ static void gpiochip_free_hogs(struct gpio_chip *gc) { struct gpio_desc *desc; for_each_gpio_desc_with_flag(gc, desc, FLAG_IS_HOGGED) gpiochip_free_own_desc(desc); } /** * gpiod_get_array - obtain multiple GPIOs from a multi-index GPIO function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * This function acquires all the GPIOs defined under a given function. * * Returns: * The GPIO descriptors corresponding to the function @con_id of device * dev, -ENOENT if no GPIO has been assigned to the requested function, * or another IS_ERR() code if an error occurred while trying to acquire * the GPIOs. */ struct gpio_descs *__must_check gpiod_get_array(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_desc *desc; struct gpio_descs *descs; struct gpio_array *array_info = NULL; struct gpio_chip *gc; int count, bitmap_size; size_t descs_size; count = gpiod_count(dev, con_id); if (count < 0) return ERR_PTR(count); descs_size = struct_size(descs, desc, count); descs = kzalloc(descs_size, GFP_KERNEL); if (!descs) return ERR_PTR(-ENOMEM); for (descs->ndescs = 0; descs->ndescs < count; descs->ndescs++) { desc = gpiod_get_index(dev, con_id, descs->ndescs, flags); if (IS_ERR(desc)) { gpiod_put_array(descs); return ERR_CAST(desc); } descs->desc[descs->ndescs] = desc; gc = gpiod_to_chip(desc); /* * If pin hardware number of array member 0 is also 0, select * its chip as a candidate for fast bitmap processing path. */ if (descs->ndescs == 0 && gpio_chip_hwgpio(desc) == 0) { struct gpio_descs *array; bitmap_size = BITS_TO_LONGS(gc->ngpio > count ? gc->ngpio : count); array = krealloc(descs, descs_size + struct_size(array_info, invert_mask, 3 * bitmap_size), GFP_KERNEL | __GFP_ZERO); if (!array) { gpiod_put_array(descs); return ERR_PTR(-ENOMEM); } descs = array; array_info = (void *)descs + descs_size; array_info->get_mask = array_info->invert_mask + bitmap_size; array_info->set_mask = array_info->get_mask + bitmap_size; array_info->desc = descs->desc; array_info->size = count; array_info->chip = gc; bitmap_set(array_info->get_mask, descs->ndescs, count - descs->ndescs); bitmap_set(array_info->set_mask, descs->ndescs, count - descs->ndescs); descs->info = array_info; } /* If there is no cache for fast bitmap processing path, continue */ if (!array_info) continue; /* Unmark array members which don't belong to the 'fast' chip */ if (array_info->chip != gc) { __clear_bit(descs->ndescs, array_info->get_mask); __clear_bit(descs->ndescs, array_info->set_mask); } /* * Detect array members which belong to the 'fast' chip * but their pins are not in hardware order. */ else if (gpio_chip_hwgpio(desc) != descs->ndescs) { /* * Don't use fast path if all array members processed so * far belong to the same chip as this one but its pin * hardware number is different from its array index. */ if (bitmap_full(array_info->get_mask, descs->ndescs)) { array_info = NULL; } else { __clear_bit(descs->ndescs, array_info->get_mask); __clear_bit(descs->ndescs, array_info->set_mask); } } else { /* Exclude open drain or open source from fast output */ if (gpiochip_line_is_open_drain(gc, descs->ndescs) || gpiochip_line_is_open_source(gc, descs->ndescs)) __clear_bit(descs->ndescs, array_info->set_mask); /* Identify 'fast' pins which require invertion */ if (gpiod_is_active_low(desc)) __set_bit(descs->ndescs, array_info->invert_mask); } } if (array_info) dev_dbg(dev, "GPIO array info: chip=%s, size=%d, get_mask=%lx, set_mask=%lx, invert_mask=%lx\n", array_info->chip->label, array_info->size, *array_info->get_mask, *array_info->set_mask, *array_info->invert_mask); return descs; } EXPORT_SYMBOL_GPL(gpiod_get_array); /** * gpiod_get_array_optional - obtain multiple GPIOs from a multi-index GPIO * function * @dev: GPIO consumer, can be NULL for system-global GPIOs * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * This is equivalent to gpiod_get_array(), except that when no GPIO was * assigned to the requested function it will return NULL. * * Returns: * The GPIO descriptors corresponding to the function @con_id of device * dev, NULL if no GPIO has been assigned to the requested function, * or another IS_ERR() code if an error occurred while trying to acquire * the GPIOs. */ struct gpio_descs *__must_check gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; descs = gpiod_get_array(dev, con_id, flags); if (gpiod_not_found(descs)) return NULL; return descs; } EXPORT_SYMBOL_GPL(gpiod_get_array_optional); /** * gpiod_put - dispose of a GPIO descriptor * @desc: GPIO descriptor to dispose of * * No descriptor can be used after gpiod_put() has been called on it. */ void gpiod_put(struct gpio_desc *desc) { if (desc) gpiod_free(desc); } EXPORT_SYMBOL_GPL(gpiod_put); /** * gpiod_put_array - dispose of multiple GPIO descriptors * @descs: struct gpio_descs containing an array of descriptors */ void gpiod_put_array(struct gpio_descs *descs) { unsigned int i; for (i = 0; i < descs->ndescs; i++) gpiod_put(descs->desc[i]); kfree(descs); } EXPORT_SYMBOL_GPL(gpiod_put_array); static int gpio_stub_drv_probe(struct device *dev) { /* * The DT node of some GPIO chips have a "compatible" property, but * never have a struct device added and probed by a driver to register * the GPIO chip with gpiolib. In such cases, fw_devlink=on will cause * the consumers of the GPIO chip to get probe deferred forever because * they will be waiting for a device associated with the GPIO chip * firmware node to get added and bound to a driver. * * To allow these consumers to probe, we associate the struct * gpio_device of the GPIO chip with the firmware node and then simply * bind it to this stub driver. */ return 0; } static struct device_driver gpio_stub_drv = { .name = "gpio_stub_drv", .bus = &gpio_bus_type, .probe = gpio_stub_drv_probe, }; static int __init gpiolib_dev_init(void) { int ret; /* Register GPIO sysfs bus */ ret = bus_register(&gpio_bus_type); if (ret < 0) { pr_err("gpiolib: could not register GPIO bus type\n"); return ret; } ret = driver_register(&gpio_stub_drv); if (ret < 0) { pr_err("gpiolib: could not register GPIO stub driver\n"); bus_unregister(&gpio_bus_type); return ret; } ret = alloc_chrdev_region(&gpio_devt, 0, GPIO_DEV_MAX, GPIOCHIP_NAME); if (ret < 0) { pr_err("gpiolib: failed to allocate char dev region\n"); driver_unregister(&gpio_stub_drv); bus_unregister(&gpio_bus_type); return ret; } gpiolib_initialized = true; gpiochip_setup_devs(); #if IS_ENABLED(CONFIG_OF_DYNAMIC) && IS_ENABLED(CONFIG_OF_GPIO) WARN_ON(of_reconfig_notifier_register(&gpio_of_notifier)); #endif /* CONFIG_OF_DYNAMIC && CONFIG_OF_GPIO */ return ret; } core_initcall(gpiolib_dev_init); #ifdef CONFIG_DEBUG_FS static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev) { bool active_low, is_irq, is_out; unsigned int gpio = gdev->base; struct gpio_desc *desc; struct gpio_chip *gc; int value; guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); if (!gc) { seq_puts(s, "Underlying GPIO chip is gone\n"); return; } for_each_gpio_desc(gc, desc) { guard(srcu)(&desc->gdev->desc_srcu); is_irq = test_bit(FLAG_USED_AS_IRQ, &desc->flags); if (is_irq || test_bit(FLAG_REQUESTED, &desc->flags)) { gpiod_get_direction(desc); is_out = test_bit(FLAG_IS_OUT, &desc->flags); value = gpio_chip_get_value(gc, desc); active_low = test_bit(FLAG_ACTIVE_LOW, &desc->flags); seq_printf(s, " gpio-%-3u (%-20.20s|%-20.20s) %s %s %s%s\n", gpio, desc->name ?: "", gpiod_get_label(desc), is_out ? "out" : "in ", value >= 0 ? (value ? "hi" : "lo") : "? ", is_irq ? "IRQ " : "", active_low ? "ACTIVE LOW" : ""); } else if (desc->name) { seq_printf(s, " gpio-%-3u (%-20.20s)\n", gpio, desc->name); } gpio++; } } struct gpiolib_seq_priv { bool newline; int idx; }; static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos) { struct gpiolib_seq_priv *priv; struct gpio_device *gdev; loff_t index = *pos; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return NULL; s->private = priv; if (*pos > 0) priv->newline = true; priv->idx = srcu_read_lock(&gpio_devices_srcu); list_for_each_entry_srcu(gdev, &gpio_devices, list, srcu_read_lock_held(&gpio_devices_srcu)) { if (index-- == 0) return gdev; } return NULL; } static void *gpiolib_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct gpiolib_seq_priv *priv = s->private; struct gpio_device *gdev = v, *next; next = list_entry_rcu(gdev->list.next, struct gpio_device, list); gdev = &next->list == &gpio_devices ? NULL : next; priv->newline = true; ++*pos; return gdev; } static void gpiolib_seq_stop(struct seq_file *s, void *v) { struct gpiolib_seq_priv *priv = s->private; srcu_read_unlock(&gpio_devices_srcu, priv->idx); kfree(priv); } static int gpiolib_seq_show(struct seq_file *s, void *v) { struct gpiolib_seq_priv *priv = s->private; struct gpio_device *gdev = v; struct gpio_chip *gc; struct device *parent; if (priv->newline) seq_putc(s, '\n'); guard(srcu)(&gdev->srcu); gc = srcu_dereference(gdev->chip, &gdev->srcu); if (!gc) { seq_printf(s, "%s: (dangling chip)\n", dev_name(&gdev->dev)); return 0; } seq_printf(s, "%s: GPIOs %u-%u", dev_name(&gdev->dev), gdev->base, gdev->base + gdev->ngpio - 1); parent = gc->parent; if (parent) seq_printf(s, ", parent: %s/%s", parent->bus ? parent->bus->name : "no-bus", dev_name(parent)); if (gc->label) seq_printf(s, ", %s", gc->label); if (gc->can_sleep) seq_printf(s, ", can sleep"); seq_printf(s, ":\n"); if (gc->dbg_show) gc->dbg_show(s, gc); else gpiolib_dbg_show(s, gdev); return 0; } static const struct seq_operations gpiolib_sops = { .start = gpiolib_seq_start, .next = gpiolib_seq_next, .stop = gpiolib_seq_stop, .show = gpiolib_seq_show, }; DEFINE_SEQ_ATTRIBUTE(gpiolib); static int __init gpiolib_debugfs_init(void) { /* /sys/kernel/debug/gpio */ debugfs_create_file("gpio", 0444, NULL, NULL, &gpiolib_fops); return 0; } subsys_initcall(gpiolib_debugfs_init); #endif /* DEBUG_FS */ |
101 64 2 99 4 99 92 51 84 84 84 13 48 85 37 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | /* mpicoder.c - Coder for the external representation of MPIs * Copyright (C) 1998, 1999 Free Software Foundation, Inc. * * This file is part of GnuPG. * * GnuPG is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * GnuPG is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ #include <linux/bitops.h> #include <linux/count_zeros.h> #include <linux/byteorder/generic.h> #include <linux/scatterlist.h> #include <linux/string.h> #include "mpi-internal.h" #define MAX_EXTERN_MPI_BITS 16384 /** * mpi_read_raw_data - Read a raw byte stream as a positive integer * @xbuffer: The data to read * @nbytes: The amount of data to read */ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) { const uint8_t *buffer = xbuffer; int i, j; unsigned nbits, nlimbs; mpi_limb_t a; MPI val = NULL; while (nbytes > 0 && buffer[0] == 0) { buffer++; nbytes--; } nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); return NULL; } if (nbytes > 0) nbits -= count_leading_zeros(buffer[0]) - (BITS_PER_LONG - 8); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) return NULL; val->nbits = nbits; val->sign = 0; val->nlimbs = nlimbs; if (nbytes > 0) { i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; i %= BYTES_PER_MPI_LIMB; for (j = nlimbs; j > 0; j--) { a = 0; for (; i < BYTES_PER_MPI_LIMB; i++) { a <<= 8; a |= *buffer++; } i = 0; val->d[j - 1] = a; } } return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_data); MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) { const uint8_t *buffer = xbuffer; unsigned int nbits, nbytes; MPI val; if (*ret_nread < 2) return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); return ERR_PTR(-EINVAL); } nbytes = DIV_ROUND_UP(nbits, 8); if (nbytes + 2 > *ret_nread) { pr_info("MPI: mpi larger than buffer nbytes=%u ret_nread=%u\n", nbytes, *ret_nread); return ERR_PTR(-EINVAL); } val = mpi_read_raw_data(buffer + 2, nbytes); if (!val) return ERR_PTR(-ENOMEM); *ret_nread = nbytes + 2; return val; } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); static int count_lzeros(MPI a) { mpi_limb_t alimb; int i, lzeros = 0; for (i = a->nlimbs - 1; i >= 0; i--) { alimb = a->d[i]; if (alimb == 0) { lzeros += sizeof(mpi_limb_t); } else { lzeros += count_leading_zeros(alimb) / 8; break; } } return lzeros; } /** * mpi_read_buffer() - read MPI to a buffer provided by user (msb first) * * @a: a multi precision integer * @buf: buffer to which the output will be written to. Needs to be at * least mpi_get_size(a) long. * @buf_len: size of the buf. * @nbytes: receives the actual length of the data written on success and * the data to-be-written on -EOVERFLOW in case buf_len was too * small. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign) { uint8_t *p; #if BYTES_PER_MPI_LIMB == 4 __be32 alimb; #elif BYTES_PER_MPI_LIMB == 8 __be64 alimb; #else #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); int i, lzeros; if (!buf || !nbytes) return -EINVAL; if (sign) *sign = a->sign; lzeros = count_lzeros(a); if (buf_len < n - lzeros) { *nbytes = n - lzeros; return -EOVERFLOW; } p = buf; *nbytes = n - lzeros; for (i = a->nlimbs - 1 - lzeros / BYTES_PER_MPI_LIMB, lzeros %= BYTES_PER_MPI_LIMB; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 alimb = cpu_to_be32(a->d[i]); #elif BYTES_PER_MPI_LIMB == 8 alimb = cpu_to_be64(a->d[i]); #else #error please implement for this limb size. #endif memcpy(p, (u8 *)&alimb + lzeros, BYTES_PER_MPI_LIMB - lzeros); p += BYTES_PER_MPI_LIMB - lzeros; lzeros = 0; } return 0; } EXPORT_SYMBOL_GPL(mpi_read_buffer); /* * mpi_get_buffer() - Returns an allocated buffer with the MPI (msb first). * Caller must free the return string. * This function does return a 0 byte buffer with nbytes set to zero if the * value of A is zero. * * @a: a multi precision integer. * @nbytes: receives the length of this buffer. * @sign: if not NULL, it will be set to the sign of the a. * * Return: Pointer to MPI buffer or NULL on error */ void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign) { uint8_t *buf; unsigned int n; int ret; if (!nbytes) return NULL; n = mpi_get_size(a); if (!n) n++; buf = kmalloc(n, GFP_KERNEL); if (!buf) return NULL; ret = mpi_read_buffer(a, buf, n, nbytes, sign); if (ret) { kfree(buf); return NULL; } return buf; } EXPORT_SYMBOL_GPL(mpi_get_buffer); /** * mpi_write_to_sgl() - Funnction exports MPI to an sgl (msb first) * * This function works in the same way as the mpi_read_buffer, but it * takes an sgl instead of u8 * buf. * * @a: a multi precision integer * @sgl: scatterlist to write to. Needs to be at least * mpi_get_size(a) long. * @nbytes: the number of bytes to write. Leading bytes will be * filled with zero. * @sign: if not NULL, it will be set to the sign of a. * * Return: 0 on success or error code in case of error */ int mpi_write_to_sgl(MPI a, struct scatterlist *sgl, unsigned nbytes, int *sign) { u8 *p, *p2; #if BYTES_PER_MPI_LIMB == 4 __be32 alimb; #elif BYTES_PER_MPI_LIMB == 8 __be64 alimb; #else #error please implement for this limb size. #endif unsigned int n = mpi_get_size(a); struct sg_mapping_iter miter; int i, x, buf_len; int nents; if (sign) *sign = a->sign; if (nbytes < n) return -EOVERFLOW; nents = sg_nents_for_len(sgl, nbytes); if (nents < 0) return -EINVAL; sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC | SG_MITER_TO_SG); sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; while (nbytes > n) { i = min_t(unsigned, nbytes - n, buf_len); memset(p2, 0, i); p2 += i; nbytes -= i; buf_len -= i; if (!buf_len) { sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; } } for (i = a->nlimbs - 1; i >= 0; i--) { #if BYTES_PER_MPI_LIMB == 4 alimb = a->d[i] ? cpu_to_be32(a->d[i]) : 0; #elif BYTES_PER_MPI_LIMB == 8 alimb = a->d[i] ? cpu_to_be64(a->d[i]) : 0; #else #error please implement for this limb size. #endif p = (u8 *)&alimb; for (x = 0; x < sizeof(alimb); x++) { *p2++ = *p++; if (!--buf_len) { sg_miter_next(&miter); buf_len = miter.length; p2 = miter.addr; } } } sg_miter_stop(&miter); return 0; } EXPORT_SYMBOL_GPL(mpi_write_to_sgl); /* * mpi_read_raw_from_sgl() - Function allocates an MPI and populates it with * data from the sgl * * This function works in the same way as the mpi_read_raw_data, but it * takes an sgl instead of void * buffer. i.e. it allocates * a new MPI and reads the content of the sgl to the MPI. * * @sgl: scatterlist to read from * @nbytes: number of bytes to read * * Return: Pointer to a new MPI or NULL on error */ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) { struct sg_mapping_iter miter; unsigned int nbits, nlimbs; int x, j, z, lzeros, ents; unsigned int len; const u8 *buff; mpi_limb_t a; MPI val = NULL; ents = sg_nents_for_len(sgl, nbytes); if (ents < 0) return NULL; sg_miter_start(&miter, sgl, ents, SG_MITER_ATOMIC | SG_MITER_FROM_SG); lzeros = 0; len = 0; while (nbytes > 0) { while (len && !*buff) { lzeros++; len--; buff++; } if (len && *buff) break; sg_miter_next(&miter); buff = miter.addr; len = miter.length; nbytes -= lzeros; lzeros = 0; } miter.consumed = lzeros; nbytes -= lzeros; nbits = nbytes * 8; if (nbits > MAX_EXTERN_MPI_BITS) { sg_miter_stop(&miter); pr_info("MPI: mpi too large (%u bits)\n", nbits); return NULL; } if (nbytes > 0) nbits -= count_leading_zeros(*buff) - (BITS_PER_LONG - 8); sg_miter_stop(&miter); nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) return NULL; val->nbits = nbits; val->sign = 0; val->nlimbs = nlimbs; if (nbytes == 0) return val; j = nlimbs - 1; a = 0; z = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; z %= BYTES_PER_MPI_LIMB; while (sg_miter_next(&miter)) { buff = miter.addr; len = min_t(unsigned, miter.length, nbytes); nbytes -= len; for (x = 0; x < len; x++) { a <<= 8; a |= *buff++; if (((z + x + 1) % BYTES_PER_MPI_LIMB) == 0) { val->d[j--] = a; a = 0; } } z += x; } return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl); |
4378 439 114 97 88 15 8749 8761 8759 4415 4946 15 1365 1359 1 1377 4628 523 30 1103 1 1189 1344 100 165 371 436 1178 1372 31 1400 39 39 2 31 38 38 1662 1659 1659 456 455 1461 2 177 250 14 13 21 1482 31 506 2 1306 100 1514 7 1522 1518 102 343 64 1976 2136 842 823 33 32 17 16 18 695 662 28 674 25 27 14 9 2 1 2 3 2 9 567 1657 2 1657 5 1616 65 1659 1657 1657 1662 1662 4 1657 1770 1769 1775 1772 1771 1733 136 1769 1730 137 1771 64 51 1774 1773 1775 2 2 2 1768 1771 1777 36 1774 7 1772 1773 1772 19 1767 36 19 1769 1714 323 1773 1772 1773 1777 1053 1081 1773 1771 8 1776 1774 19 648 88 1726 1771 1774 1 1771 4 1770 1772 10 1773 3 1769 1 1726 138 2 1771 2 1773 1773 10 1771 7 13 1774 9 1 4 9 5 6 8 7 7 5 7 7 7 5 7 10 8 9 1 12 6 15 10 1742 119 348 1553 1 6 75 21 1772 1521 1773 1775 1769 16 1769 90 13 1 1626 262 263 2 2 1773 119 1 1 2 119 1 121 121 121 10 16 1 5 5 2 3 122 134 133 5 13 121 121 2 2 18 2 4 23 1 115 121 121 121 121 123 1 124 16 1 2 35 2 5 5 1437 2 1445 1444 3 1441 8 1594 2 1 1294 1291 1294 1297 1 1295 2 2 1292 1 1290 3 1287 1 1239 36 1 37 38 2 62 2 2 56 1 2 40 2 6 4 2 3 2 33 2 2 3 25 3 3 21 1 2 16 1 3 11 1 2 8 1 2 1 1 3 2 1 239 1 19 21 55 53 138 16 5 2 1 3 5 3 6 2 6 447 19 78 354 432 431 1 395 17 21 373 8 41 361 51 403 5 6 373 34 403 4 154 258 288 321 89 374 1 5 365 369 1 5 368 1 6 372 1 3 372 1 372 1 2 371 5 367 7 294 4 7 3 66 3 294 2 290 4 261 27 24 4 271 16 246 2 1 15 2 13 3 17 1 5 134 124 213 242 100 13 13 73 2 4 68 2 63 53 1 9 4 61 1 1 1 1 2 84 101 2 5 1 91 2 84 15 4 86 3 13 2 85 74 290 315 393 44 560 865 5 742 123 6 741 122 2 2 860 9 816 4 31 813 36 842 6 847 3 846 4 846 3 848 3 843 4 842 6 845 2 844 1 847 1 5 2 269 14 176 312 3 1 29 33 1 52 867 866 41 821 17 848 1 363 565 552 401 156 137 1 21 21 1 7 11 18 15 456 893 6 53 4 8 865 1537 3 2 1534 1 316 2 1101 29 29 69 1123 49 1141 1002 40 5 1 3 1377 1373 1 1377 1369 1366 1188 2 18 4 4 4 8 8 5 5 35 35 22 2 11 2 9 2 6 2 3 3 6 11 2 2 5 5 6 30 1 8 2 8 1 10 3 2 10 2 10 3 4 21 10 72 71 6 40 5 5 57 57 26 57 27 10 57 1647 1563 177 1648 338 1502 46 1605 1456 1455 2 2 20 5 1 19 1 1 12 12 15 2 1 1 7 4 11 16 52 2 5 3 52 1 5 3 2 2 1 46 30 11 6 6 15 23 6 2 3 1 37 17 20 2 2 3 16 2 2 13 1 18 2 7 6 3 3 3 1 7 12 4 2 8 2 6 8 8 5 8 8 2 5 1 6 2 9 9 1 1 3 2 2 4 2 1 2 5 32 19 14 18 8 1 6 13 12 12 7 5 5 4 7 5 7 7 6 2 6 6 2 3 15 15 1 3 1 8 1 2 28 28 2 2 1 1 1 1 2 1 1 1 2 2 2 1 1 6 1 5 15 3 11 9 6 1 11 22 22 12 15 15 1 14 15 1 14 2 2 97 1 21 11 82 2 80 80 7 71 9 9 1 8 1 9 10 62 1 10 5 54 1 6 7 44 9 10 1 1 1 7 6 20 10 10 10 10 1 1 2 1 11 3 11 1 11 4 1 10 11 1 10 11 1 4 1 4 23 23 2 16 9 15 23 10 1 9 23 12 4 1 6 23 11 23 14 14 1 5 14 11 11 5 11 4 4 3 4 5 1 6 5 5 6 6 6 2 26 4 19 2 4 28 6 6 5 3 21 21 2 2 1 2 10 22 8 3 2 12 12 12 1 1 1 17 17 2 2 1 1 5 1 2 1 1 2 2 1 2 7 3 5 4 2 4 4 7 1 1 1 1 3 12 1 7 1 3 1 37 1 3 2 1 6 1 8 2 12 2 1 1 26 1 36 1 12 3 1 1 1 18 8 1 1 1 4 1 24 13 11 8 2 3 1 1 2 1 6 462 76 439 24 305 23 132 83 100 433 5122 2 1 1 4218 481 436 433 35 35 361 72 332 434 4706 4700 263 263 4696 2 2238 2573 2576 5144 6 3 4 1692 1678 607 47 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Routing netlink socket interface: protocol independent part. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Vitaly E. Lavrov RTA_OK arithmetic was wrong. */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/security.h> #include <linux/mutex.h> #include <linux/if_addr.h> #include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> #include <linux/bpf.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/udp.h> #include <net/tcp.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> #endif #include <linux/dpll.h> #include "dev.h" #define RTNL_MAX_TYPE 50 #define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; struct module *owner; unsigned int flags; struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); void rtnl_lock(void) { mutex_lock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock); int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { if (head && tail) { tail->next = defer_kfree_skb_list; defer_kfree_skb_list = head; } } EXPORT_SYMBOL(rtnl_kfree_skbs); void __rtnl_unlock(void) { struct sk_buff *head = defer_kfree_skb_list; defer_kfree_skb_list = NULL; /* Ensure that we didn't actually add any TODO item when __rtnl_unlock() * is used. In some places, e.g. in cfg80211, we have code that will do * something like * rtnl_lock() * wiphy_lock() * ... * rtnl_unlock() * * and because netdev_run_todo() acquires the RTNL for items on the list * we could cause a situation such as this: * Thread 1 Thread 2 * rtnl_lock() * unregister_netdevice() * __rtnl_unlock() * rtnl_lock() * wiphy_lock() * rtnl_unlock() * netdev_run_todo() * __rtnl_unlock() * * // list not empty now * // because of thread 2 * rtnl_lock() * while (!list_empty(...)) * rtnl_lock() * wiphy_lock() * **** DEADLOCK **** * * However, usage of __rtnl_unlock() is rare, and so we can ensure that * it's not used in cases where something is added to do the list. */ WARN_ON(!list_empty(&net_todo_list)); mutex_unlock(&rtnl_mutex); while (head) { struct sk_buff *next = head->next; kfree_skb(head); cond_resched(); head = next; } } void rtnl_unlock(void) { /* This fellow will unlock it for us. */ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ #ifdef CONFIG_DEBUG_NET_SMALL_RTNL void __rtnl_net_lock(struct net *net) { ASSERT_RTNL(); mutex_lock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_lock); void __rtnl_net_unlock(struct net *net) { ASSERT_RTNL(); mutex_unlock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_unlock); void rtnl_net_lock(struct net *net) { rtnl_lock(); __rtnl_net_lock(net); } EXPORT_SYMBOL(rtnl_net_lock); void rtnl_net_unlock(struct net *net) { __rtnl_net_unlock(net); rtnl_unlock(); } EXPORT_SYMBOL(rtnl_net_unlock); int rtnl_net_trylock(struct net *net) { int ret = rtnl_trylock(); if (ret) __rtnl_net_lock(net); return ret; } EXPORT_SYMBOL(rtnl_net_trylock); static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) return 0; /* always init_net first */ if (net_eq(net_a, &init_net)) return -1; if (net_eq(net_b, &init_net)) return 1; /* otherwise lock in ascending order */ return net_a < net_b ? -1 : 1; } int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { const struct net *net_a, *net_b; net_a = container_of(a, struct net, rtnl_mutex.dep_map); net_b = container_of(b, struct net, rtnl_mutex.dep_map); return rtnl_net_cmp_locks(net_a, net_b); } bool rtnl_net_is_locked(struct net *net) { return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); } EXPORT_SYMBOL(rtnl_net_is_locked); bool lockdep_rtnl_net_is_held(struct net *net) { return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_net_is_held); #else static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { /* No need to swap */ return -1; } #endif struct rtnl_nets { /* ->newlink() needs to freeze 3 netns at most; * 2 for the new device, 1 for its peer. */ struct net *net[3]; unsigned char len; }; static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) { memset(rtnl_nets, 0, sizeof(*rtnl_nets)); } static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) { put_net(rtnl_nets->net[i]); rtnl_nets->net[i] = NULL; } rtnl_nets->len = 0; } /** * rtnl_nets_add - Add netns to be locked before ->newlink(). * * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). * @net: netns pointer with an extra refcnt held. * * The extra refcnt is released in rtnl_nets_destroy(). */ static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) { int i; DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); for (i = 0; i < rtnl_nets->len; i++) { switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { case 0: put_net(net); return; case 1: swap(rtnl_nets->net[i], net); } } rtnl_nets->net[i] = net; rtnl_nets->len++; } static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) { int i; rtnl_lock(); for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_lock(rtnl_nets->net[i]); } static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) { int i; for (i = 0; i < rtnl_nets->len; i++) __rtnl_net_unlock(rtnl_nets->net[i]); rtnl_unlock(); } static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL && (flags & RTNL_FLAG_BULK_DEL_SUPPORTED)); link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregster() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL); if (!tab) { rtnl_unlock(); return; } for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rcu_replace_pointer_rtnl(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); /** * __rtnl_register_many - Register rtnetlink message types * @handlers: Array of struct rtnl_msg_handlers * @n: The length of @handlers * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. * * When one element of @handlers fails to register, * 1) built-in: panics. * 2) modules : the previous successful registrations are unwinded * and an error is returned. * * Use rtnl_register_many(). */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i, err; for (i = 0, handler = handlers; i < n; i++, handler++) { err = rtnl_register_internal(handler->owner, handler->protocol, handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { if (!handler->owner) panic("Unable to register rtnetlink message " "handlers, %pS\n", handlers); __rtnl_unregister_many(handlers, i); break; } } return err; } EXPORT_SYMBOL_GPL(__rtnl_register_many); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i; for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) rtnl_unregister(handler->protocol, handler->msgtype); } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { struct rtnl_link_ops *ops; rcu_read_lock(); list_for_each_entry_rcu(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; err = init_srcu_struct(&ops->srcu); if (err) return err; mutex_lock(&link_ops_mutex); list_for_each_entry(tmp, &link_ops, list) { if (!strcmp(ops->kind, tmp->kind)) { err = -EEXIST; goto unlock; } } list_add_tail_rcu(&ops->list, &link_ops); unlock: mutex_unlock(&link_ops_mutex); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); mutex_unlock(&link_ops_mutex); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); for_each_net(net) __rtnl_kill_links(net, ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { struct rtnl_af_ops *ops; ASSERT_RTNL(); rcu_read_lock(); list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { if (ops->family == family) { *srcu_index = srcu_read_lock(&ops->srcu); goto unlock; } } ops = NULL; unlock: rcu_read_unlock(); return ops; } static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) { srcu_read_unlock(&ops->srcu, srcu_index); } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Return: 0 on success or a negative error code. */ int rtnl_af_register(struct rtnl_af_ops *ops) { int err = init_srcu_struct(&ops->srcu); if (err) return err; rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. * @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); void netdev_set_operstate(struct net_device *dev, int newstate) { unsigned int old = READ_ONCE(dev->operstate); do { if (old == newstate) return; } while (!try_cmpxchg(&dev->operstate, &old, newstate)); netdev_state_change(dev); } EXPORT_SYMBOL(netdev_set_operstate); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; unsigned int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(name_node, &dev->name_node->list, list) cnt++; rcu_read_unlock(); if (!cnt) return 0; return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); /* Assume dev->proto_down_reason is not zero. */ size += nla_total_size(0) + nla_total_size(4); return size; } static size_t rtnl_devlink_port_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ if (dev->devlink_port) size += devlink_nl_port_handle_size(dev->devlink_port); return size; } static size_t rtnl_dpll_pin_size(const struct net_device *dev) { size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */ size += dpll_netdev_pin_handle_size(dev); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. */ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; nla_put_vf_failure: nla_nest_cancel(skb, vf); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, const struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = READ_ONCE(dev->mem_start); map.mem_end = READ_ONCE(dev->mem_end); map.base_addr = READ_ONCE(dev->base_addr); map.irq = READ_ONCE(dev->irq); map.dma = READ_ONCE(dev->dma); map.port = READ_ONCE(dev->if_port); if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; u32 res = 0; rcu_read_lock(); generic_xdp_prog = rcu_dereference(dev->xdp_prog); if (generic_xdp_prog) res = generic_xdp_prog->aux->id; rcu_read_unlock(); return res; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, READ_ONCE(upper_dev->ifindex)); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int iflink = dev_get_iflink(dev); if (force || READ_ONCE(dev->ifindex) != iflink) return nla_put_u32(skb, IFLA_LINK, iflink); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. */ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry_rcu(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } /* RCU protected. */ static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down))) goto nla_put_failure; preason = READ_ONCE(dev->proto_down_reason); if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_devlink_port(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *devlink_port_nest; int ret; devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); if (!devlink_port_nest) return -EMSGSIZE; if (dev->devlink_port) { ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); if (ret < 0) goto nest_cancel; } nla_nest_end(skb, devlink_port_nest); return 0; nest_cancel: nla_nest_cancel(skb, devlink_port_nest); return ret; } static int rtnl_fill_dpll_pin(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *dpll_pin_nest; int ret; dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN); if (!dpll_pin_nest) return -EMSGSIZE; ret = dpll_netdev_add_pin_handle(skb, dev); if (ret < 0) goto nest_cancel; nla_nest_end(skb, dpll_pin_nest); return 0; nest_cancel: nla_nest_cancel(skb, dpll_pin_nest); return ret; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { char devname[IFNAMSIZ]; struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = READ_ONCE(dev->type); ifm->ifi_index = READ_ONCE(dev->ifindex); ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; netdev_copy_name(dev, devname); if (nla_put_string(skb, IFLA_IFNAME, devname)) goto nla_put_failure; if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) || nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) || nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, READ_ONCE(dev->num_tx_queues)) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, READ_ONCE(dev->gso_max_segs)) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, READ_ONCE(dev->gso_max_size)) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, READ_ONCE(dev->gro_max_size)) || nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE, READ_ONCE(dev->gso_ipv4_max_size)) || nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE, READ_ONCE(dev->gro_ipv4_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SIZE, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC)) goto nla_put_failure_rcu; qdisc = rcu_dereference(dev->qdisc); if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) goto nla_put_failure_rcu; if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure_rcu; if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure_rcu; rcu_read_unlock(); if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; if (rtnl_fill_devlink_port(skb, dev)) goto nla_put_failure; if (rtnl_fill_dpll_pin(skb, dev)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, [IFLA_LINKINFO] = { .type = NLA_NESTED }, [IFLA_NET_NS_PID] = { .type = NLA_U32 }, [IFLA_NET_NS_FD] = { .type = NLA_U32 }, /* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to * allow 0-length string (needed to remove an alias). */ [IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 }, [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, [IFLA_MIN_MTU] = { .type = NLA_U32 }, [IFLA_MAX_MTU] = { .type = NLA_U32 }, [IFLA_PROP_LIST] = { .type = NLA_NESTED }, [IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { [IFLA_INFO_KIND] = { .type = NLA_STRING }, [IFLA_INFO_DATA] = { .type = NLA_NESTED }, [IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING }, [IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED }, }; static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_BROADCAST] = { .type = NLA_REJECT }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, [IFLA_VF_STATS] = { .type = NLA_NESTED }, [IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) }, [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) }, [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, /* Unused, but we need to keep it here since user space could * fill it. It's also broken with regard to NLA_BINARY use in * combination with structs. */ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, int *ops_srcu_index) { struct nlattr *linfo[IFLA_INFO_MAX + 1]; struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; struct { unsigned long ifindex; } *ctx = (void *)cb->ctx; struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); err = PTR_ERR(tgt_net); netnsid = -1; goto out; } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); err = -EINVAL; goto out; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: err = 0; for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { if (link_dump_filtered(dev, master_idx, kind_ops)) continue; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) break; } cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); out: if (kind_ops) rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { struct net *net = NULL; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); return net; } struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net = rtnl_link_get_net_ifla(tb); if (!net) net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_set_vf_rate) return -EOPNOTSUPP; if (max_tx_rate && max_tx_rate < min_tx_rate) return -EINVAL; return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; if (tb[IFLA_GSO_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_max_size"); return -EINVAL; } if (tb[IFLA_GSO_MAX_SEGS] && (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS || nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) { NL_SET_ERR_MSG(extack, "too big gso_max_segs"); return -EINVAL; } if (tb[IFLA_GRO_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_max_size"); return -EINVAL; } if (tb[IFLA_GSO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) { NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_GRO_IPV4_MAX_SIZE] && nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) { NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size"); return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { struct rtnl_af_ops *af_ops; int af_ops_srcu_index; af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) err = -EOPNOTSUPP; else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); else err = 0; rtnl_af_put(af_ops, af_ops_srcu_index); if (err < 0) return err; } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = rtnl_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = rtnl_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) retu |