Total coverage: 268153 (18%) of 1572246
9 1 1 5 8 6 3 9 9 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 // SPDX-License-Identifier: GPL-2.0-only /* * symlink.c * * PURPOSE * Symlink handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998-2001 Ben Fennema * (C) 1999 Stelias Computing Inc * * HISTORY * * 04/16/99 blf Created. * */ #include "udfdecl.h" #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/pagemap.h> #include "udf_i.h" static int udf_pc_to_char(struct super_block *sb, unsigned char *from, int fromlen, unsigned char *to, int tolen) { struct pathComponent *pc; int elen = 0; int comp_len; unsigned char *p = to; /* Reserve one byte for terminating \0 */ tolen--; while (elen < fromlen) { pc = (struct pathComponent *)(from + elen); elen += sizeof(struct pathComponent); switch (pc->componentType) { case 1: /* * Symlink points to some place which should be agreed * upon between originator and receiver of the media. Ignore. */ if (pc->lengthComponentIdent > 0) { elen += pc->lengthComponentIdent; break; } fallthrough; case 2: if (tolen == 0) return -ENAMETOOLONG; p = to; *p++ = '/'; tolen--; break; case 3: if (tolen < 3) return -ENAMETOOLONG; memcpy(p, "../", 3); p += 3; tolen -= 3; break; case 4: if (tolen < 2) return -ENAMETOOLONG; memcpy(p, "./", 2); p += 2; tolen -= 2; /* that would be . 
- just ignore */ break; case 5: elen += pc->lengthComponentIdent; if (elen > fromlen) return -EIO; comp_len = udf_get_filename(sb, pc->componentIdent, pc->lengthComponentIdent, p, tolen); if (comp_len < 0) return comp_len; p += comp_len; tolen -= comp_len; if (tolen == 0) return -ENAMETOOLONG; *p++ = '/'; tolen--; break; } } if (p > to + 1) p[-1] = '\0'; else p[0] = '\0'; return 0; } static int udf_symlink_filler(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; int err = 0; unsigned char *p = folio_address(folio); struct udf_inode_info *iinfo = UDF_I(inode); /* We don't support symlinks longer than one block */ if (inode->i_size > inode->i_sb->s_blocksize) { err = -ENAMETOOLONG; goto out; } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { symlink = iinfo->i_data + iinfo->i_lenEAttr; } else { bh = udf_bread(inode, 0, 0, &err); if (!bh) { if (!err) err = -EFSCORRUPTED; goto out; } symlink = bh->b_data; } err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE); brelse(bh); out: folio_end_read(folio, err == 0); return err; } static int udf_symlink_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct inode *inode = d_backing_inode(dentry); struct folio *folio; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); folio = read_mapping_folio(inode->i_mapping, 0, NULL); if (IS_ERR(folio)) return PTR_ERR(folio); /* * UDF uses non-trivial encoding of symlinks so i_size does not match * number of characters reported by readlink(2) which apparently some * applications expect. Also POSIX says that "The value returned in the * st_size field shall be the length of the contents of the symbolic * link, and shall not count a trailing null if one is present." So * let's report the length of string returned by readlink(2) for * st_size. 
*/ stat->size = strlen(folio_address(folio)); folio_put(folio); return 0; } /* * symlinks can't do much... */ const struct address_space_operations udf_symlink_aops = { .read_folio = udf_symlink_filler, }; const struct inode_operations udf_symlink_inode_operations = { .get_link = page_get_link, .getattr = udf_symlink_getattr, };
1 8 17 16 1 12 11 1 1 1 1 1 1 1 8 8 5 4 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner */ #include "gateway_client.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/udp.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "routing.h" #include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp * packet starting at the beginning of the dhcp header */ #define BATADV_DHCP_HTYPE_OFFSET 1 #define BATADV_DHCP_HLEN_OFFSET 2 /* Value of htype representing Ethernet */ #define BATADV_DHCP_HTYPE_ETHERNET 0x01 /* This is the offset of the "chaddr" field in the dhcp packet starting at the * beginning of the dhcp header */ #define BATADV_DHCP_CHADDR_OFFSET 28 /** * batadv_gw_node_release() - release gw_node from lists and queue for free * after rcu grace period * @ref: kref pointer of the gw_node */ void batadv_gw_node_release(struct kref *ref) { struct batadv_gw_node *gw_node; gw_node = container_of(ref, struct batadv_gw_node, refcount); batadv_orig_node_put(gw_node->orig_node); kfree_rcu(gw_node, rcu); } /** * batadv_gw_get_selected_gw_node() - Get currently selected gateway * @bat_priv: the bat priv with all the soft interface information * * Return: selected gateway (with increased refcnt), NULL on errors */ struct batadv_gw_node * 
batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; rcu_read_lock(); gw_node = rcu_dereference(bat_priv->gw.curr_gw); if (!gw_node) goto out; if (!kref_get_unless_zero(&gw_node->refcount)) gw_node = NULL; out: rcu_read_unlock(); return gw_node; } /** * batadv_gw_get_selected_orig() - Get originator of currently selected gateway * @bat_priv: the bat priv with all the soft interface information * * Return: orig_node of selected gateway (with increased refcnt), NULL on errors */ struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; struct batadv_orig_node *orig_node = NULL; gw_node = batadv_gw_get_selected_gw_node(bat_priv); if (!gw_node) goto out; rcu_read_lock(); orig_node = gw_node->orig_node; if (!orig_node) goto unlock; if (!kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; unlock: rcu_read_unlock(); out: batadv_gw_node_put(gw_node); return orig_node; } static void batadv_gw_select(struct batadv_priv *bat_priv, struct batadv_gw_node *new_gw_node) { struct batadv_gw_node *curr_gw_node; spin_lock_bh(&bat_priv->gw.list_lock); if (new_gw_node) kref_get(&new_gw_node->refcount); curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, true); batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } /** * batadv_gw_reselect() - force a gateway reselection * @bat_priv: the bat priv with all the soft interface information * * Set a flag to remind the GW component to perform a new gateway reselection. * However this function does not ensure that the current gateway is going to be * deselected. The reselection mechanism may elect the same gateway once again. * * This means that invoking batadv_gw_reselect() does not guarantee a gateway * change and therefore a uevent is not necessarily expected. 
*/ void batadv_gw_reselect(struct batadv_priv *bat_priv) { atomic_set(&bat_priv->gw.reselect, 1); } /** * batadv_gw_check_client_stop() - check if client mode has been switched off * @bat_priv: the bat priv with all the soft interface information * * This function assumes the caller has checked that the gw state *is actually * changing*. This function is not supposed to be called when there is no state * change. */ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw; if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) return; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!curr_gw) return; /* deselect the current gateway so that next time that client mode is * enabled a proper GW_ADD event can be sent */ batadv_gw_select(bat_priv, NULL); /* if batman-adv is switching the gw client mode off and a gateway was * already selected, send a DEL uevent */ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); batadv_gw_node_put(curr_gw); } /** * batadv_gw_election() - Elect the best gateway * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_election(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw = NULL; struct batadv_gw_node *next_gw = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; if (!bat_priv->algo_ops->gw.get_best_gw_node) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw) goto out; /* if gw.reselect is set to 1 it means that a previous call to * gw.is_eligible() said that we have a new best GW, therefore it can * now be picked from the list and selected */ next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv); if (curr_gw == next_gw) goto out; if (next_gw) { sprintf(gw_addr, "%pM", next_gw->orig_node->orig); 
router = batadv_orig_router_get(next_gw->orig_node, BATADV_IF_DEFAULT); if (!router) { batadv_gw_reselect(bat_priv); goto out; } router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) { batadv_gw_reselect(bat_priv); goto out; } } if (curr_gw && !next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Removing selected gateway - no gateway in range\n"); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); } else if (!curr_gw && next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, next_gw->bandwidth_up % 10, router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_ADD, gw_addr); } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Changing route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, next_gw->bandwidth_up % 10, router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_CHANGE, gw_addr); } batadv_gw_select(bat_priv, next_gw); out: batadv_gw_node_put(curr_gw); batadv_gw_node_put(next_gw); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_gw_check_election() - Elect orig node as best gateway when eligible * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked */ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node *curr_gw_orig; /* abort immediately if the routing algorithm does not support gateway * election */ if (!bat_priv->algo_ops->gw.is_eligible) return; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) goto reselect; /* this node already is the gateway */ if (curr_gw_orig == 
orig_node) goto out; if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig, orig_node)) goto out; reselect: batadv_gw_reselect(bat_priv); out: batadv_orig_node_put(curr_gw_orig); } /** * batadv_gw_node_add() - add gateway node to list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information * * Has to be called with the appropriate locks being acquired * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway) { struct batadv_gw_node *gw_node; lockdep_assert_held(&bat_priv->gw.list_lock); if (gateway->bandwidth_down == 0) return; gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); if (!gw_node) return; kref_init(&gw_node->refcount); INIT_HLIST_NODE(&gw_node->list); kref_get(&orig_node->refcount); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); bat_priv->gw.generation++; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", orig_node->orig, ntohl(gateway->bandwidth_down) / 10, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); /* don't return reference to new gw_node */ batadv_gw_node_put(gw_node); } /** * batadv_gw_node_get() - retrieve gateway node from list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * * Return: gateway node if found or NULL otherwise. 
*/ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list, list) { if (gw_node_tmp->orig_node != orig_node) continue; if (!kref_get_unless_zero(&gw_node_tmp->refcount)) continue; gw_node = gw_node_tmp; break; } rcu_read_unlock(); return gw_node; } /** * batadv_gw_node_update() - update list of available gateways with changed * bandwidth information * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information */ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway) { struct batadv_gw_node *gw_node, *curr_gw = NULL; spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } spin_unlock_bh(&bat_priv->gw.list_lock); if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway bandwidth of originator %pM changed from %u.%u/%u.%u MBit to %u.%u/%u.%u MBit\n", orig_node->orig, gw_node->bandwidth_down / 10, gw_node->bandwidth_down % 10, gw_node->bandwidth_up / 10, gw_node->bandwidth_up % 10, ntohl(gateway->bandwidth_down) / 10, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); if (ntohl(gateway->bandwidth_down) == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway %pM removed from gateway list\n", orig_node->orig); /* Note: We don't need a NULL check here, since curr_gw 
never * gets dereferenced. */ spin_lock_bh(&bat_priv->gw.list_lock); if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); batadv_gw_node_put(curr_gw); } out: batadv_gw_node_put(gw_node); } /** * batadv_gw_node_delete() - Remove orig_node from gateway list * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is currently in process of being removed */ void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tvlv_gateway_data gateway; gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; batadv_gw_node_update(bat_priv, orig_node, &gateway); } /** * batadv_gw_node_free() - Free gateway information from soft interface * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; spin_lock_bh(&bat_priv->gw.list_lock); hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); } /** * batadv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * * Return: Error code, or length of message */ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; struct net_device *soft_iface; struct batadv_priv *bat_priv; int ret; soft_iface = batadv_netlink_get_softif(cb); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != 
BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } if (!bat_priv->algo_ops->gw.dump) { ret = -EOPNOTSUPP; goto out; } bat_priv->algo_ops->gw.dump(msg, cb, bat_priv); ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); return ret; } /** * batadv_gw_dhcp_recipient_get() - check if a packet is a DHCP message * @skb: the packet to check * @header_len: a pointer to the batman-adv header size * @chaddr: buffer where the client address will be stored. Valid * only if the function returns BATADV_DHCP_TO_CLIENT * * This function may re-allocate the data buffer of the skb passed as argument. * * Return: * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error * while parsing it * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr) { enum batadv_dhcp_recipient ret = BATADV_DHCP_NO; struct ethhdr *ethhdr; struct iphdr *iphdr; struct ipv6hdr *ipv6hdr; struct udphdr *udphdr; struct vlan_ethhdr *vhdr; int chaddr_offset; __be16 proto; u8 *p; /* check for ethernet header */ if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) return BATADV_DHCP_NO; ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; *header_len += ETH_HLEN; /* check for initial vlan header */ if (proto == htons(ETH_P_8021Q)) { if (!pskb_may_pull(skb, *header_len + VLAN_HLEN)) return BATADV_DHCP_NO; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; *header_len += VLAN_HLEN; } /* check for ip header */ switch (proto) { case htons(ETH_P_IP): if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr))) return BATADV_DHCP_NO; iphdr = (struct iphdr *)(skb->data + *header_len); *header_len += iphdr->ihl * 4; /* check for udp header */ if (iphdr->protocol != IPPROTO_UDP) return BATADV_DHCP_NO; break; case htons(ETH_P_IPV6): if (!pskb_may_pull(skb, *header_len + 
sizeof(*ipv6hdr))) return BATADV_DHCP_NO; ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len); *header_len += sizeof(*ipv6hdr); /* check for udp header */ if (ipv6hdr->nexthdr != IPPROTO_UDP) return BATADV_DHCP_NO; break; default: return BATADV_DHCP_NO; } if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return BATADV_DHCP_NO; udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); /* check for bootp port */ switch (proto) { case htons(ETH_P_IP): if (udphdr->dest == htons(67)) ret = BATADV_DHCP_TO_SERVER; else if (udphdr->source == htons(67)) ret = BATADV_DHCP_TO_CLIENT; break; case htons(ETH_P_IPV6): if (udphdr->dest == htons(547)) ret = BATADV_DHCP_TO_SERVER; else if (udphdr->source == htons(547)) ret = BATADV_DHCP_TO_CLIENT; break; } chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET; /* store the client address if the message is going to a client */ if (ret == BATADV_DHCP_TO_CLIENT) { if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) return BATADV_DHCP_NO; /* check if the DHCP packet carries an Ethernet DHCP */ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET; if (*p != BATADV_DHCP_HTYPE_ETHERNET) return BATADV_DHCP_NO; /* check if the DHCP packet carries a valid Ethernet address */ p = skb->data + *header_len + BATADV_DHCP_HLEN_OFFSET; if (*p != ETH_ALEN) return BATADV_DHCP_NO; ether_addr_copy(chaddr, skb->data + chaddr_offset); } return ret; } /** * batadv_gw_out_of_range() - check if the dhcp request destination is the best * gateway * @bat_priv: the bat priv with all the soft interface information * @skb: the outgoing packet * * Check if the skb is a DHCP request and if it is sent to the current best GW * server. Due to topology changes it may be the case that the GW server * previously selected is not the best one anymore. * * This call might reallocate skb data. * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. 
*
 * Return: true if the packet destination is unicast and it is not the best gw,
 * false otherwise.
 */
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
			    struct sk_buff *skb)
{
	struct batadv_neigh_node *neigh_curr = NULL;
	struct batadv_neigh_node *neigh_old = NULL;
	struct batadv_orig_node *orig_dst_node = NULL;
	struct batadv_gw_node *gw_node = NULL;
	struct batadv_gw_node *curr_gw = NULL;
	struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo;
	struct ethhdr *ethhdr;
	bool out_of_range = false;
	u8 curr_tq_avg;
	unsigned short vid;

	vid = batadv_get_vid(skb, 0);

	/* This function is documented as possibly reallocating skb data and
	 * batadv_get_vid() is the only callee that is handed the skb, so the
	 * ethernet header pointer is taken only after that call instead of
	 * being cached beforehand.
	 */
	ethhdr = (struct ethhdr *)skb->data;

	if (is_multicast_ether_addr(ethhdr->h_dest))
		goto out;

	orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source,
						 ethhdr->h_dest, vid);
	if (!orig_dst_node)
		goto out;

	/* only destinations that are known gateways are of interest */
	gw_node = batadv_gw_node_get(bat_priv, orig_dst_node);
	if (!gw_node)
		goto out;

	switch (atomic_read(&bat_priv->gw.mode)) {
	case BATADV_GW_MODE_SERVER:
		/* If we are a GW then we are our best GW. We can artificially
		 * set the tq towards ourself as the maximum value
		 */
		curr_tq_avg = BATADV_TQ_MAX_VALUE;
		break;
	case BATADV_GW_MODE_CLIENT:
		curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
		if (!curr_gw)
			goto out;

		/* packet is going to our gateway */
		if (curr_gw->orig_node == orig_dst_node)
			goto out;

		/* If the dhcp packet has been sent to a different gw,
		 * we have to evaluate whether the old gw is still
		 * reliable enough
		 */
		neigh_curr = batadv_find_router(bat_priv, curr_gw->orig_node,
						NULL);
		if (!neigh_curr)
			goto out;

		curr_ifinfo = batadv_neigh_ifinfo_get(neigh_curr,
						      BATADV_IF_DEFAULT);
		if (!curr_ifinfo)
			goto out;

		curr_tq_avg = curr_ifinfo->bat_iv.tq_avg;
		batadv_neigh_ifinfo_put(curr_ifinfo);

		break;
	case BATADV_GW_MODE_OFF:
	default:
		goto out;
	}

	neigh_old = batadv_find_router(bat_priv, orig_dst_node, NULL);
	if (!neigh_old)
		goto out;

	old_ifinfo = batadv_neigh_ifinfo_get(neigh_old, BATADV_IF_DEFAULT);
	if (!old_ifinfo)
		goto out;

	/* out of range once the current best gateway beats the packet's
	 * destination gateway by more than the threshold
	 */
	if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD)
		out_of_range = true;
	batadv_neigh_ifinfo_put(old_ifinfo);

out:
	batadv_orig_node_put(orig_dst_node);
	batadv_gw_node_put(curr_gw);
	batadv_gw_node_put(gw_node);
	batadv_neigh_node_put(neigh_old);
	batadv_neigh_node_put(neigh_curr);
	return out_of_range;
}
7 227 33 6 234 484 491 486 59 470 70 70 377 375 472 470 474 34 322 85 5 59 284 378 377 257 4 111 15 70 68 217 229 161 156 146 225 4 5 230 15 27 201 203 203 203 201 235 236 215 14 49 230 8 201 95 204 106 159 172 48 162 40 186 22 167 130 88 44 130 238 1 236 239 238 235 28 105 61 161 1 186 7 1 16 26 24 168 1 50 1 212 207 14 221 221 219 336 336 326 36 158 88 363 339 336 207 11 125 194 99 106 192 363 2 25 1 340 88 367 1 87 321 317 319 1 9 5 4 7 2 150 7 139 130 5 5 227 165 1 57 2 221 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 
407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 
907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 
1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 // SPDX-License-Identifier: GPL-2.0 /* * This file contains the procedures for the handling of select and poll * * Created for Linux based loosely upon Mathius Lattner's minix * patches by Peter MacDonald. Heavily edited by Linus. * * 4 February 1994 * COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS * flag set in its personality we do *not* modify the given timeout * parameter to reflect time remaining. * * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/rt.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/freezer.h> #include <net/busy_poll.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> /* * Estimate expected accuracy in ns from a timeval. * * After quite a bit of churning around, we've settled on * a simple thing of taking 0.1% of the timeout as the * slack, with a cap of 100 msec. * "nice" tasks get a 0.5% slack instead. 
* * Consider this comment an open invitation to come up with even * better solutions.. */ #define MAX_SLACK (100 * NSEC_PER_MSEC) static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; if (tv->tv_sec < 0) return 0; if (task_nice(current) > 0) divfactor = divfactor / 5; if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor)) return MAX_SLACK; slack = tv->tv_nsec / divfactor; slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); if (slack > MAX_SLACK) return MAX_SLACK; return slack; } u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; struct timespec64 now; u64 slack = current->timer_slack_ns; if (slack == 0) return 0; ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < slack) return slack; return ret; } struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to * follow, but it should be free of race-conditions, and it's practical. If you * understand what I'm doing here, then you understand how the linux * sleep/wakeup mechanism works. * * Two very simple procedures, poll_wait() and poll_freewait() make all the * work. poll_wait() is an inline-function defined in <linux/poll.h>, * as all select/poll functions have to call it to add an entry to the * poll table. 
*/
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p);

/*
 * Reset @pwq before a polling pass: install __pollwait() as the queueing
 * callback, record the current task as the one to wake, and clear the
 * triggered flag, the error code, the page list and the inline-entry count.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->polling_task = current;
	pwq->triggered = 0;
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}
EXPORT_SYMBOL(poll_initwait);

/*
 * Undo one __pollwait() registration: take the entry off its wait queue
 * and drop the file reference it holds.
 */
static void free_poll_entry(struct poll_table_entry *entry)
{
	remove_wait_queue(entry->wait_address, &entry->wait);
	fput(entry->filp);
}

/*
 * Release every entry recorded in @pwq: the inline entries first, then the
 * entries of each chained page, freeing each page once it has been emptied.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		/*
		 * Walk backwards from the first unused slot.  A page always
		 * holds at least one entry: poll_get_entry() hands one out
		 * right after allocating the page.
		 */
		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

/*
 * Hand out the next unused poll_table_entry of @p.
 *
 * Inline entries are used first; after that entries come from pages that
 * are allocated on demand and pushed on the front of p->table.  On
 * allocation failure p->error is set to -ENOMEM and NULL is returned.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

	return table->entry++;
}

/*
 * Mark the poll_wqueues behind @wait as triggered and wake its polling
 * task.  Returns the result of default_wake_function().
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *pwq = wait->private;
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with smp_store_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
 * Wake callback installed by __pollwait().  A wakeup that carries a key
 * with none of the events this entry registered for is ignored (returns 0);
 * anything else is passed on to __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry;

	entry = container_of(wait, struct poll_table_entry, wait);
	if (key && !(key_to_poll(key) & entry->key))
		return 0;
	return __pollwake(wait, mode, sync, key);
}

/*
 * Add a new entry: take a reference on @filp, remember the wait queue and
 * the requested event key, and queue the entry on @wait_address with
 * pollwake() as its wake function.  Silently does nothing when no entry
 * can be obtained (poll_get_entry() has then recorded the error).
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *entry = poll_get_entry(pwq);
	if (!entry)
		return;
	entry->filp = get_file(filp);
	entry->wait_address = wait_address;
	entry->key = p->_key;
	init_waitqueue_func_entry(&entry->wait, pollwake);
	entry->wait.private = pwq;
	add_wait_queue(wait_address, &entry->wait);
}

/*
 * Put the current task into @state and sleep until @expires unless @pwq
 * has already been triggered.
 *
 * Returns -EINTR when the sleep was skipped because @pwq was triggered,
 * otherwise the value of schedule_hrtimeout_range().  Callers in this file
 * treat a return of 0 as "timed out".  pwq->triggered is cleared before
 * returning.
 */
static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	set_current_state(state);
	if (!pwq->triggered)
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following smp_store_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	smp_store_mb(pwq->triggered, 0);

	return rc;
}

/**
 * poll_select_set_timeout - helper function to setup the timeout value
 * @to:		pointer to timespec64 variable for the final timeout
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here, That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};

	if (!timespec64_valid(&ts))
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
		/* Turn the relative timeout into an absolute end time. */
		ktime_get_ts64(to);
		*to = timespec64_add_safe(*to, ts);
	}
	return 0;
}

/* User-space layout that poll_select_finish() writes the remaining time in. */
enum poll_time_type {
	PT_TIMEVAL = 0,
	PT_OLD_TIMEVAL = 1,
	PT_TIMESPEC = 2,
	PT_OLD_TIMESPEC = 3,
};

/*
 * Common tail of the select/poll family: restore the saved signal mask
 * (unless @ret is -ERESTARTNOHAND) and, when the caller supplied a timeout
 * pointer @p, write the time remaining until @end_time back to user space
 * in the layout selected by @pt_type.
 *
 * Returns @ret, except that -ERESTARTNOHAND becomes -EINTR when the
 * remaining time was not written back (STICKY_TIMEOUTS personality or a
 * failed copy to user space).
 */
static int poll_select_finish(struct timespec64 *end_time,
			      void __user *p,
			      enum poll_time_type pt_type, int ret)
{
	struct timespec64 rts;

	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);

	if (!p)
		return ret;

	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	/* No update for zero timeout */
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	/* Remaining time, clamped at zero once the deadline has passed. */
	ktime_get_ts64(&rts);
	rts = timespec64_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;


	switch (pt_type) {
	case PT_TIMEVAL:
		{
			struct __kernel_old_timeval rtv;

			/* Clear padding, if any, before copying to user space. */
			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
				memset(&rtv, 0, sizeof(rtv));
			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
			if (!copy_to_user(p, &rtv, sizeof(rtv)))
				return ret;
		}
		break;
	case PT_OLD_TIMEVAL:
		{
			struct old_timeval32 rtv;

			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
			if (!copy_to_user(p, &rtv, sizeof(rtv)))
				return ret;
		}
		break;
	case PT_TIMESPEC:
		if (!put_timespec64(&rts, p))
			return ret;
		break;
	case PT_OLD_TIMESPEC:
		if (!put_old_timespec32(&rts, p))
			return ret;
		break;
	default:
		BUG();
	}
	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

/*
 * Scalable version of the fd_set.
 */

typedef struct {
	unsigned long *in, *out, *ex;
	unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
#define FDS_BITPERLONG	(8*sizeof(long))
#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))

/*
 * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
 *
 * Copies FDS_BYTES(nr) bytes from @ufdset into @fdset, or zeroes @fdset
 * when @ufdset is NULL.  Returns 0 on success, -EFAULT if the copy fails.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	nr = FDS_BYTES(nr);
	if (ufdset)
		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;

	memset(fdset, 0, nr);
	return 0;
}

/*
 * Copy FDS_BYTES(nr) bytes of @fdset back to @ufdset.  Returns the value
 * of __copy_to_user() (non-zero on failure), or 0 when @ufdset is NULL.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (ufdset)
		return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
	return 0;
}

/* Clear the FDS_BYTES(nr) bytes of @fdset. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
	memset(fdset, 0, FDS_BYTES(nr));
}

/* Word @n of each input set, and the union of the three. */
#define FDS_IN(fds, n)		(fds->in + n)
#define FDS_OUT(fds, n)		(fds->out + n)
#define FDS_EX(fds, n)		(fds->ex + n)

#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Check the descriptors below @n that are set in any of the three input
 * sets against the open-fd bitmap of the current task.
 *
 * Returns -EBADF if a requested descriptor is not marked open; otherwise
 * one more than the highest requested descriptor (0 when none is set).
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	/* handle last incomplete long-word first */
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	n /= BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds + n;
	max = 0;
	if (set) {
		set &= BITS(fds, n);
		if (set) {
			if (!(set & ~*open_fds))
				goto get_max;
			return -EBADF;
		}
	}
	/* Walk the remaining full words from the top down. */
	while (n) {
		open_fds--;
		n--;
		set = BITS(fds, n);
		if (!set)
			continue;
		if (set & ~*open_fds)
			return -EBADF;
		/* Highest fd already found; keep validating lower words. */
		if (max)
			continue;
get_max:
		/* Position of the highest set bit in this word, plus one. */
		do {
			max++;
			set >>= 1;
		} while (set);
		max += n * BITS_PER_LONG;
	}

	return max;
}

/* Event masks that make a descriptor count as ready in each of the sets. */
#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
			EPOLLNVAL)
#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
			 EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)

/*
 * Poll descriptor @fd for do_select().  The key stored in @wait always
 * contains POLLEX_SET and @ll_flag, plus POLLIN_SET / POLLOUT_SET when @bit
 * is set in @in / @out.  Returns EPOLLNVAL if @fd has no file, otherwise
 * the mask reported by vfs_poll().
 */
static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit,
				__poll_t ll_flag)
{
	CLASS(fd, f)(fd);

	if (fd_empty(f))
		return EPOLLNVAL;

	wait->_key = POLLEX_SET | ll_flag;
	if (in & bit)
		wait->_key |= POLLIN_SET;
	if (out & bit)
		wait->_key |= POLLOUT_SET;

	return vfs_poll(fd_file(f), wait);
}

/*
 * Core of select(): poll the descriptors below @n that are set in
 * fds->in/out/ex and record ready ones in fds->res_in/res_out/res_ex.
 * The scan is repeated after each sleep until something is ready, the
 * timeout @end_time expires, a signal is pending or an error was recorded.
 *
 * Returns the number of ready bits recorded across the three result sets,
 * 0 if nothing became ready, or a negative error (-EBADF from
 * max_select_fd(), or the error stored in the poll table).
 */
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	/* Only scan up to the highest descriptor actually requested. */
	n = retval;

	poll_initwait(&table);
	wait = &table.pt;
	/* Zero timeout: a single pass with no wait-queue registration. */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		wait->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* One iteration per long-word of the bitmaps. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			__poll_t mask;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				mask = select_poll_one(i, wait, in, out, bit,
						       busy_flag);
				/* Once anything is ready, stop registering waiters. */
				if ((mask & POLLIN_SET) && (in & bit)) {
					res_in |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((mask & POLLOUT_SET) && (out & bit)) {
					res_out |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((mask & POLLEX_SET) && (ex & bit)) {
					res_ex |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				/* got something, stop busy polling */
				if (retval) {
					can_busy_loop = false;
					busy_flag = 0;

				/*
				 * only remember a returned
				 * POLL_BUSY_LOOP if we asked for it
				 */
				} else if (busy_flag & mask)
					can_busy_loop = true;

			}
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		/* The first pass is over; later passes must not queue again. */
		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
*/
/*
 * Copy the three user fd sets in, run do_select() and copy the results out.
 * @n is clamped to the max_fds of the current file table.
 *
 * Returns the value of do_select() when it is non-zero, 0 on timeout,
 * -ERESTARTNOHAND when nothing is ready and a signal is pending, -EINVAL
 * for negative @n, -ENOMEM if the bitmaps cannot be allocated and -EFAULT
 * if a user copy fails.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	size_t size, alloc_size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kvmalloc() */
		ret = -ENOMEM;
		/* Refuse sizes for which 6 * size would overflow. */
		if (size > (SIZE_MAX / 6))
			goto out_nofds;

		alloc_size = 6 * size;
		bits = kvmalloc(alloc_size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	/* Carve the buffer into six consecutive bitmaps of @size bytes. */
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: restartable if a signal cut the wait short. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kvfree(bits);
out_nofds:
	return ret;
}

/*
 * select() with a struct __kernel_old_timeval timeout.  A NULL @tvp means
 * no timeout.  Returns -EFAULT if @tvp cannot be read and -EINVAL if the
 * timeout is rejected by poll_select_set_timeout(); otherwise the result
 * of core_sys_select() as post-processed by poll_select_finish().
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
	struct timespec64 end_time, *to = NULL;
	struct __kernel_old_timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		to = &end_time;
		/* Fold whole seconds contained in tv_usec into the seconds part. */
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret);
}

/* select(2) entry point; all work is done by kern_select(). */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
	return kern_select(n, inp, outp, exp, tvp);
}

/*
 * Common body of the pselect6 variants.  @type says whether @tsp points to
 * a 64-bit (PT_TIMESPEC) or old 32-bit (PT_OLD_TIMESPEC) timespec; any
 * other value hits BUG().  The user signal mask is installed with
 * set_user_sigmask() before selecting.
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, void __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize,
		       enum poll_time_type type)
{
	struct timespec64 ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		switch (type) {
		case PT_TIMESPEC:
			if (get_timespec64(&ts, tsp))
				return -EFAULT;
			break;
		case PT_OLD_TIMESPEC:
			if (get_old_timespec32(&ts, tsp))
				return -EFAULT;
			break;
		default:
			BUG();
		}

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	ret = set_user_sigmask(sigmask, sigsetsize);
	if (ret)
		return ret;

	ret = core_sys_select(n, inp, outp, exp, to);
	return poll_select_finish(&end_time, tsp, type, ret);
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
*/
struct sigset_argpack {
	sigset_t __user *p;
	size_t size;
};

/*
 * Fetch the user-space sigset_argpack at @from into @to.  A NULL @from
 * leaves @to untouched and succeeds.  Returns 0 on success and -EFAULT if
 * the user memory cannot be read.
 */
static inline int get_sigset_argpack(struct sigset_argpack *to,
				     struct sigset_argpack __user *from)
{
	// the path is hot enough for overhead of copy_from_user() to matter
	if (from) {
		if (can_do_masked_user_access())
			from = masked_user_access_begin(from);
		else if (!user_read_access_begin(from, sizeof(*from)))
			return -EFAULT;
		/* Both reads jump to Efault on failure. */
		unsafe_get_user(to->p, &from->p, Efault);
		unsafe_get_user(to->size, &from->size, Efault);
		user_read_access_end();
	}
	return 0;
Efault:
	/* Close the user-access section opened above before failing. */
	user_read_access_end();
	return -EFAULT;
}

/*
 * pselect6: @sig points to a struct sigset_argpack (or is NULL) and @tsp
 * to a struct __kernel_timespec.
 */
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
		void __user *, sig)
{
	struct sigset_argpack x = {NULL, 0};

	if (get_sigset_argpack(&x, sig))
		return -EFAULT;

	return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/* Same as pselect6, but @tsp points to a struct old_timespec32. */
SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct old_timespec32 __user *, tsp,
		void __user *, sig)
{
	struct sigset_argpack x = {NULL, 0};

	if (get_sigset_argpack(&x, sig))
		return -EFAULT;

	return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC);
}

#endif

#ifdef __ARCH_WANT_SYS_OLD_SELECT
/* Argument block of old_select(): the five select() arguments in memory. */
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct __kernel_old_timeval __user *tvp;
};

/* select() variant that takes its arguments through a user-space struct. */
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

/*
 * One chunk of pollfds copied in from user space; chunks are chained
 * through @next and @len is the number of valid @entries.
 */
struct poll_list {
	struct poll_list *next;
	unsigned int len;
	struct pollfd entries[] __counted_by(len);
};

/* Number of pollfds that fit in one page together with the list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
 * Fish for pollable events on the pollfd->fd file descriptor.
We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 *
 * A negative fd yields 0 and an fd without a file yields EPOLLNVAL.
 * *can_busy_poll is set when the reported mask contains @busy_flag.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
				     bool *can_busy_poll,
				     __poll_t busy_flag)
{
	int fd = pollfd->fd;
	__poll_t mask, filter;

	if (fd < 0)
		return 0;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return EPOLLNVAL;

	/* userland u16 ->events contains POLL... bitmap */
	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
	pwait->_key = filter | busy_flag;
	mask = vfs_poll(fd_file(f), pwait);
	if (mask & busy_flag)
		*can_busy_poll = true;
	return mask & filter;		/* Mask out unneeded events. */
}

/*
 * Core of poll(): scan every pollfd on @list, storing the result in its
 * revents field, and sleep between scans until an event is found, the
 * timeout @end_time expires, a signal is pending or an error was recorded
 * in @wait.
 *
 * Returns the number of pollfds with events, 0 on timeout, wait->error, or
 * -ERESTARTNOHAND when nothing was found and a signal is pending.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
		   struct timespec64 *end_time)
{
	poll_table* pt = &wait->pt;
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
	u64 slack = 0;
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	/* Optimise the no-wait case */
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		pt->_qproc = NULL;
		timed_out = 1;
	}

	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	for (;;) {
		struct poll_list *walk;
		bool can_busy_loop = false;

		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				__poll_t mask;
				/*
				 * Fish for events. If we found one, record it
				 * and kill poll_table->_qproc, so we don't
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
				mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag);
				pfd->revents = mangle_poll(mask);
				if (mask) {
					count++;
					pt->_qproc = NULL;
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
				}
			}
		}
		/*
		 * All waiters have already been registered, so don't provide
		 * a poll_table->_qproc to them on the next loop iteration.
		 */
		pt->_qproc = NULL;
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -ERESTARTNOHAND;
		}
		if (count || timed_out)
			break;

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}
	return count;
}

/* Number of pollfds that fit in the on-stack buffer after the list header. */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

/*
 * Copy @nfds pollfds in from @ufds (the first chunk on the stack, the rest
 * in kmalloc'ed chunks), run do_poll() and write each revents field back
 * to user space.
 *
 * Returns the value of do_poll(), -EINVAL if @nfds exceeds RLIMIT_NOFILE,
 * -ENOMEM if a chunk cannot be allocated and -EFAULT if user memory cannot
 * be accessed.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
		struct timespec64 *end_time)
{
	struct poll_wqueues table;
	int err = -EFAULT, fdcount;
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
	struct poll_list *const head = (struct poll_list *)stack_pps;
	struct poll_list *walk = head;
	unsigned int todo = nfds;
	unsigned int len;

	if (nfds > rlimit(RLIMIT_NOFILE))
		return -EINVAL;

	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;

		/* nfds - todo is the number of pollfds copied so far. */
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		if (walk->len >= todo)
			break;
		todo -= walk->len;

		len = min(todo, POLLFD_PER_PAGE);
		walk = walk->next = kmalloc(struct_size(walk, entries, len),
					    GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
			goto out_fds;
		}
	}

	poll_initwait(&table);
	fdcount = do_poll(head, &table, end_time);
	poll_freewait(&table);

	if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
		goto out_fds;

	/* Only the revents field of each user pollfd is written back. */
	for (walk = head; walk; walk = walk->next) {
		struct pollfd *fds = walk->entries;
		unsigned int j;

		for (j = walk->len; j; fds++, ufds++, j--)
			unsafe_put_user(fds->revents, &ufds->revents, Efault);
	}
	user_write_access_end();

	err = fdcount;
out_fds:
	/* The head chunk lives on the stack; free only the chained ones. */
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
	}

	return err;

Efault:
	user_write_access_end();
	err = -EFAULT;
	goto out_fds;
}

/*
 * Restart handler for poll(): re-run do_sys_poll() with the arguments and
 * end time saved in @restart_block, re-arming the restart if the call is
 * interrupted again.
 */
static long do_restart_poll(struct restart_block *restart_block)
{
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
	struct timespec64 *to = NULL, end_time;
	int ret;

	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -ERESTARTNOHAND)
		ret = set_restart_fn(restart_block, do_restart_poll);

	return ret;
}

/*
 * poll(2): a negative @timeout_msecs means no timeout.  When the call is
 * interrupted, the arguments and the computed end time are saved in the
 * task's restart block so that do_restart_poll() can resume the wait.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 end_time, *to = NULL;
	int ret;

	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -ERESTARTNOHAND) {
		struct restart_block *restart_block;

		restart_block = &current->restart_block;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = set_restart_fn(restart_block, do_restart_poll);
	}
	return ret;
}

/*
 * ppoll(2): poll with a struct __kernel_timespec timeout and a signal mask
 * that is installed with set_user_sigmask() before polling.
 */
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (get_timespec64(&ts, tsp))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	ret = set_user_sigmask(sigmask, sigsetsize);
	if (ret)
		return ret;

	ret = do_sys_poll(ufds, nfds, to);
	return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/* Same as ppoll, but @tsp points to a struct old_timespec32. */
SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
		struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 ts, end_time, *to = NULL;
	int ret;

	if (tsp) {
		if (get_old_timespec32(&ts, tsp))
			return -EFAULT;

		to = &end_time;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	ret = set_user_sigmask(sigmask, sigsetsize);
	if (ret)
		return ret;

	ret = do_sys_poll(ufds, nfds, to);
	return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret);
}
#endif

#ifdef CONFIG_COMPAT
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

/*
 * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
 * 64-bit unsigned longs.
 *
 * Reads @nr bits from the compat bitmap @ufdset into @fdset, or zeroes
 * @fdset when @ufdset is NULL.  Returns 0 or the compat_get_bitmap() error.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
			unsigned long *fdset)
{
	if (ufdset) {
		return compat_get_bitmap(fdset, ufdset, nr);
	} else {
		zero_fd_set(nr, fdset);
		return 0;
	}
}

/*
 * Write @nr bits of @fdset to the compat bitmap @ufdset; does nothing and
 * returns 0 when @ufdset is NULL.
 */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
		      unsigned long *fdset)
{
	if (!ufdset)
		return 0;
	return compat_put_bitmap(ufdset, fdset, nr);
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
* * Update: ERESTARTSYS breaks at least the xview clock binary, so * I'm trying ERESTARTNOHAND which restart only when you want to. */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; int size, max_fds, ret = -EINVAL; struct fdtable *fdt; long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; if (n < 0) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); max_fds = fdt->max_fds; rcu_read_unlock(); if (n > max_fds) n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of * long-words. */ size = FDS_BYTES(n); bits = stack_fds; if (size > sizeof(stack_fds) / 6) { bits = kmalloc_array(6, size, GFP_KERNEL); ret = -ENOMEM; if (!bits) goto out_nofds; } fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); fds.res_in = (unsigned long *) (bits + 3*size); fds.res_out = (unsigned long *) (bits + 4*size); fds.res_ex = (unsigned long *) (bits + 5*size); if ((ret = compat_get_fd_set(n, inp, fds.in)) || (ret = compat_get_fd_set(n, outp, fds.out)) || (ret = compat_get_fd_set(n, exp, fds.ex))) goto out; zero_fd_set(n, fds.res_in); zero_fd_set(n, fds.res_out); zero_fd_set(n, fds.res_ex); ret = do_select(n, &fds, end_time); if (ret < 0) goto out; if (!ret) { ret = -ERESTARTNOHAND; if (signal_pending(current)) goto out; ret = 0; } if (compat_set_fd_set(n, inp, fds.res_in) || compat_set_fd_set(n, outp, fds.res_out) || compat_set_fd_set(n, exp, fds.res_ex)) ret = -EFAULT; out: if (bits != stack_fds) kfree(bits); out_nofds: return ret; } static int do_compat_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct old_timeval32 __user *tvp) { struct timespec64 end_time, *to = NULL; struct 
old_timeval32 tv; int ret; if (tvp) { if (copy_from_user(&tv, tvp, sizeof(tv))) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tvp, PT_OLD_TIMEVAL, ret); } COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timeval32 __user *, tvp) { return do_compat_select(n, inp, outp, exp, tvp); } struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; compat_uptr_t outp; compat_uptr_t exp; compat_uptr_t tvp; }; COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) { struct compat_sel_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { switch (type) { case PT_OLD_TIMESPEC: if (get_old_timespec32(&ts, tsp)) return -EFAULT; break; case PT_TIMESPEC: if (get_timespec64(&ts, tsp)) return -EFAULT; break; default: BUG(); } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); return poll_select_finish(&end_time, tsp, type, ret); } struct compat_sigset_argpack { compat_uptr_t p; compat_size_t size; }; static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to, struct compat_sigset_argpack __user *from) { if (from) { if (!user_read_access_begin(from, sizeof(*from))) 
return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); user_read_access_end(); } return 0; Efault: user_read_access_end(); return -EFAULT; } COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_TIMESPEC); } #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) { struct compat_sigset_argpack x = {0, 0}; if (get_compat_sigset_argpack(&x, sig)) return -EFAULT; return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(x.p), x.size, PT_OLD_TIMESPEC); } #endif #if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_old_timespec32(&ts, tsp)) return -EFAULT; to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_OLD_TIMESPEC, ret); } #endif /* New compat syscall for 64 bit time_t*/ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; if 
(poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); return poll_select_finish(&end_time, tsp, PT_TIMESPEC, ret); } #endif
295 1 1 153 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 // SPDX-License-Identifier: GPL-2.0-only #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H #include <asm/vmx.h> #include "mmu.h" #include "mmu_internal.h" /* * A MMU present SPTE is backed by actual memory and may or may not be present * in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it * is ignored by all flavors of SPTEs and checking a low bit often generates * better code than for a high bit, e.g. 56+. MMU present checks are pervasive * enough that the improved code generation is noticeable in KVM's footprint. */ #define SPTE_MMU_PRESENT_MASK BIT_ULL(11) /* * TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also * be restricted to using write-protection (for L2 when CPU dirty logging, i.e. * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that * is must be employed for a given TDP SPTE. * * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE * paging, including NPT PAE. This scheme works because legacy shadow paging * is guaranteed to have A/D bits and write-protection is forced only for * TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it * must be restricted to 64-bit KVM. 
*/ #define SPTE_TDP_AD_SHIFT 52 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) #define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) static_assert(SPTE_TDP_AD_ENABLED == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK #define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else #define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif #define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ #define SPTE_EPT_READABLE_MASK 0x1ull #define SPTE_EPT_EXECUTABLE_MASK 0x4ull #define SPTE_LEVEL_BITS 9 #define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) #define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) #define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the * PTEs being access tracked also need to be dirty tracked, so the W bit will be * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); /* * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given * SPTE is write-protected. See is_writable_pte() for details. 
*/ /* Bits 9 and 10 are ignored by all non-EPT PTEs. */ #define DEFAULT_SPTE_HOST_WRITABLE BIT_ULL(9) #define DEFAULT_SPTE_MMU_WRITABLE BIT_ULL(10) /* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care * to not overlap the A/D type mask or the saved access bits of access-tracked * SPTEs when A/D bits are disabled. */ #define EPT_SPTE_HOST_WRITABLE BIT_ULL(57) #define EPT_SPTE_MMU_WRITABLE BIT_ULL(58) static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK)); static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK)); static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); /* Defined only to keep the above static asserts readable. */ #undef SHADOW_ACC_TRACK_SAVED_MASK /* * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10 * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62 * * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in * the MMIO generation number, as doing so would require stealing a bit from * the "real" generation number and thus effectively halve the maximum number * of MMIO generations that can be handled before encountering a wrap (which * requires a full MMU zap). The flag is instead explicitly queried when * checking for MMIO spte cache hits. */ #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 10 #define MMIO_SPTE_GEN_HIGH_START 52 #define MMIO_SPTE_GEN_HIGH_END 62 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ MMIO_SPTE_GEN_LOW_START) #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ MMIO_SPTE_GEN_HIGH_START) static_assert(!(SPTE_MMU_PRESENT_MASK & (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); /* * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the * MMU-present bit. 
The generation obviously co-exists with the magic MMIO * mask/value, and MMIO SPTEs are considered !MMU-present. * * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO * and so they're off-limits for generation; additional checks ensure the mask * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). */ #define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) static_assert(!(SPTE_MMIO_ALLOWED_MASK & (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) /* remember to adjust the comment above as well if you change these */ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0) #define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS) #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) /* * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress * #VE and get EPT violations on non-present PTEs. We can use the * same value also without TDX for both VMX and SVM: * * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) */ #ifdef CONFIG_X86_64 #define SHADOW_NONPRESENT_VALUE BIT_ULL(63) static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); #else #define SHADOW_NONPRESENT_VALUE 0ULL #endif /* * True if A/D bits are supported in hardware and are enabled by KVM. When * enabled, KVM uses A/D bits for all non-nested MMUs. 
Because L1 can disable * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario * where KVM is using A/D bits for L1, but not L2. */ extern bool __read_mostly kvm_ad_enabled; extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ extern u64 __read_mostly shadow_user_mask; extern u64 __read_mostly shadow_accessed_mask; extern u64 __read_mostly shadow_dirty_mask; extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ extern u64 __read_mostly shadow_acc_track_mask; /* * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order * to guard against L1TF attacks. */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * The number of high-order 1 bits to use in the mask above. */ #define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5 /* * If a thread running without exclusive control of the MMU lock must perform a * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF * vulnerability. * * Only used by the TDP MMU. */ #define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) /* Frozen SPTEs must not be misconstrued as shadow present PTEs. 
*/ static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); static inline bool is_frozen_spte(u64 spte) { return spte == FROZEN_SPTE; } /* Get an SPTE's index into its parent's page table (and the spt array). */ static inline int spte_index(u64 *sptep) { return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); } /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask * left into the reserved bits, i.e. the GFN in the SPTE will be split into * high and low parts. This mask covers the lower bits of the GFN. */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); return (struct kvm_mmu_page *)page_private(page); } static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) { return to_shadow_page(spte & SPTE_BASE_ADDR_MASK); } static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) { return to_shadow_page(__pa(sptep)); } static inline struct kvm_mmu_page *root_to_sp(hpa_t root) { if (kvm_mmu_is_dummy_root(root)) return NULL; /* * The "root" may be a special root, e.g. a PAE entry, treat it as a * SPTE to ensure any non-PA bits are dropped. 
*/ return spte_to_child_sp(root); } static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && likely(enable_mmio_caching); } static inline bool is_shadow_present_pte(u64 pte) { return !!(pte & SPTE_MMU_PRESENT_MASK); } static inline bool is_ept_ve_possible(u64 spte) { return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && !(spte & VMX_EPT_SUPPRESS_VE_BIT) && (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; } static inline bool spte_ad_enabled(u64 spte) { KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); /* * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit * TDP and do the A/D type check unconditionally. 
*/ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } static inline bool is_access_track_spte(u64 spte) { return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } static inline bool is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; } static inline bool is_last_spte(u64 pte, int level) { return (level == PG_LEVEL_4K) || is_large_pte(pte); } static inline bool is_executable_pte(u64 spte) { return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; } static inline kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) { return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { int bit7 = (pte >> 7) & 1; return rsvd_check->rsvd_bits_mask[bit7][level-1]; } static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { return pte & get_rsvd_bits(rsvd_check, pte, level); } static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) { return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); } static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, u64 spte, int level) { return __is_bad_mt_xwr(rsvd_check, spte) || __is_rsvd_bits_set(rsvd_check, spte, level); } /* * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages * so that they can be split down into the dirty logging * granularity (4KiB) whenever the guest writes to them. KVM also * write-protects 4KiB pages so that writes can be recorded in the dirty log * (e.g. if not using PML). SPTEs are write-protected for dirty logging * during the VM-iotcls that enable dirty logging. * * 2. To intercept writes to guest page tables that KVM is shadowing. When a * guest writes to its page table the corresponding shadow page table will * be marked "unsync". 
That way KVM knows which shadow page tables need to * be updated on the next TLB flush, INVLPG, etc. and which do not. * * 3. To prevent guest writes to read-only memory, such as for memory in a * read-only memslot or guest memory backed by a read-only VMA. Writes to * such pages are disallowed entirely. * * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this * case, the SPTE is access-protected, not just write-protected! * * For cases #1 and #4, KVM can safely make such SPTEs writable without taking * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits * in the SPTE: * * shadow_mmu_writable_mask, aka MMU-writable - * Cleared on SPTEs that KVM is currently write-protecting for shadow paging * purposes (case 2 above). * * shadow_host_writable_mask, aka Host-writable - * Cleared on SPTEs that are not host-writable (case 3 above) * * Note, not all possible combinations of PT_WRITABLE_MASK, * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid. A given * SPTE can be in only one of the following states, which map to the * aforementioned 3 cases: * * shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK * ------------------------- | ------------------------ | ---------------- * 1 | 1 | 1 (writable) * 1 | 1 | 0 (case 1) * 1 | 0 | 0 (case 2) * 0 | 0 | 0 (case 3) * * The valid combinations of these bits are checked by * check_spte_writable_invariants() whenever an SPTE is modified. * * Clearing the MMU-writable bit is always done under the MMU lock and always * accompanied by a TLB flush before dropping the lock to avoid corrupting the * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging * (which does not clear the MMU-writable bit), does not flush TLBs before * dropping the lock, as it only needs to synchronize guest writes with the * dirty bitmap. 
Similarly, making the SPTE inaccessible (and non-writable) for * access-tracking via the clear_young() MMU notifier also does not flush TLBs. * * So, there is the problem: clearing the MMU-writable bit can encounter a * write-protected SPTE while CPUs still have writable mappings for that SPTE * cached in their TLB. To address this, KVM always flushes TLBs when * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE. * * The Host-writable bit is not modified on present SPTEs, it is only set or * cleared when an SPTE is first faulted in from non-present and then remains * immutable. */ static inline bool is_writable_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; } /* Note: spte must be a shadow-present leaf SPTE. */ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) { return spte & shadow_mmu_writable_mask; } /* * Returns true if the access indicated by @fault is allowed by the existing * SPTE protections. Note, the caller is responsible for checking that the * SPTE is a shadow-present, leaf SPTE (either before or after). */ static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { if (fault->exec) return is_executable_pte(spte); if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ return spte & PT_PRESENT_MASK; } /* * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, * not whether or not SPTEs were modified, i.e. 
only the write-tracking case * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. * * Don't flush if the Accessed bit is cleared, as access tracking tolerates * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a * mmu_notifier.clear_flush_young() event. * * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and * when clearing dirty logs, KVM flushes based on whether or not dirty entries * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. * * Note, this logic only applies to shadow-present leaf SPTEs. The caller is * responsible for checking that the old SPTE is shadow-present, and is also * responsible for determining whether or not a TLB flush is required when * modifying a shadow-present non-leaf SPTE. */ static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) { return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); } static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT; return gen; } bool spte_has_volatile_bits(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); u64 make_small_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index); u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); /* Restore an acc-track PTE back to a regular PTE */ static inline u64 restore_acc_track_spte(u64 spte) { u64 saved_bits = (spte >> 
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) & SHADOW_ACC_TRACK_SAVED_BITS_MASK; spte &= ~shadow_acc_track_mask; spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT); spte |= saved_bits; return spte; } void __init kvm_mmu_spte_module_init(void); void kvm_mmu_reset_all_pte_masks(void); #endif
1 1 1 3 3 3 1 3 10 10 2 5 2 5 2 2 3 2 3 12 12 12 12 12 4 2 2 2 2 2 13 14 1 2 6 6 6 1 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 // SPDX-License-Identifier: GPL-2.0-only /* * * general timer device for using in ISDN stacks * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/poll.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mISDNif.h> #include <linux/mutex.h> #include <linux/sched/signal.h> #include "core.h" static DEFINE_MUTEX(mISDN_mutex); static u_int *debug; struct mISDNtimerdev { int next_id; struct list_head pending; struct list_head expired; wait_queue_head_t wait; u_int work; spinlock_t lock; /* protect lists */ }; struct mISDNtimer { struct list_head list; struct mISDNtimerdev *dev; struct timer_list tl; int id; }; static int mISDN_open(struct inode *ino, struct file *filep) { struct 
mISDNtimerdev *dev; if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s(%p,%p)\n", __func__, ino, filep); dev = kmalloc(sizeof(struct mISDNtimerdev) , GFP_KERNEL); if (!dev) return -ENOMEM; dev->next_id = 1; INIT_LIST_HEAD(&dev->pending); INIT_LIST_HEAD(&dev->expired); spin_lock_init(&dev->lock); dev->work = 0; init_waitqueue_head(&dev->wait); filep->private_data = dev; return nonseekable_open(ino, filep); } static int mISDN_close(struct inode *ino, struct file *filep) { struct mISDNtimerdev *dev = filep->private_data; struct list_head *list = &dev->pending; struct mISDNtimer *timer, *next; if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s(%p,%p)\n", __func__, ino, filep); spin_lock_irq(&dev->lock); while (!list_empty(list)) { timer = list_first_entry(list, struct mISDNtimer, list); spin_unlock_irq(&dev->lock); timer_shutdown_sync(&timer->tl); spin_lock_irq(&dev->lock); /* it might have been moved to ->expired */ list_del(&timer->list); kfree(timer); } spin_unlock_irq(&dev->lock); list_for_each_entry_safe(timer, next, &dev->expired, list) { kfree(timer); } kfree(dev); return 0; } static ssize_t mISDN_read(struct file *filep, char __user *buf, size_t count, loff_t *off) { struct mISDNtimerdev *dev = filep->private_data; struct list_head *list = &dev->expired; struct mISDNtimer *timer; int ret = 0; if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s(%p, %p, %d, %p)\n", __func__, filep, buf, (int)count, off); if (count < sizeof(int)) return -ENOSPC; spin_lock_irq(&dev->lock); while (list_empty(list) && (dev->work == 0)) { spin_unlock_irq(&dev->lock); if (filep->f_flags & O_NONBLOCK) return -EAGAIN; wait_event_interruptible(dev->wait, (dev->work || !list_empty(list))); if (signal_pending(current)) return -ERESTARTSYS; spin_lock_irq(&dev->lock); } if (dev->work) dev->work = 0; if (!list_empty(list)) { timer = list_first_entry(list, struct mISDNtimer, list); list_del(&timer->list); spin_unlock_irq(&dev->lock); if (put_user(timer->id, (int __user *)buf)) ret = -EFAULT; else ret = 
sizeof(int); kfree(timer); } else { spin_unlock_irq(&dev->lock); } return ret; } static __poll_t mISDN_poll(struct file *filep, poll_table *wait) { struct mISDNtimerdev *dev = filep->private_data; __poll_t mask = EPOLLERR; if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s(%p, %p)\n", __func__, filep, wait); if (dev) { poll_wait(filep, &dev->wait, wait); mask = 0; if (dev->work || !list_empty(&dev->expired)) mask |= (EPOLLIN | EPOLLRDNORM); if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s work(%d) empty(%d)\n", __func__, dev->work, list_empty(&dev->expired)); } return mask; } static void dev_expire_timer(struct timer_list *t) { struct mISDNtimer *timer = from_timer(timer, t, tl); u_long flags; spin_lock_irqsave(&timer->dev->lock, flags); if (timer->id >= 0) list_move_tail(&timer->list, &timer->dev->expired); wake_up_interruptible(&timer->dev->wait); spin_unlock_irqrestore(&timer->dev->lock, flags); } static int misdn_add_timer(struct mISDNtimerdev *dev, int timeout) { int id; struct mISDNtimer *timer; if (!timeout) { dev->work = 1; wake_up_interruptible(&dev->wait); id = 0; } else { timer = kzalloc(sizeof(struct mISDNtimer), GFP_KERNEL); if (!timer) return -ENOMEM; timer->dev = dev; timer_setup(&timer->tl, dev_expire_timer, 0); spin_lock_irq(&dev->lock); id = timer->id = dev->next_id++; if (dev->next_id < 0) dev->next_id = 1; list_add_tail(&timer->list, &dev->pending); timer->tl.expires = jiffies + ((HZ * (u_long)timeout) / 1000); add_timer(&timer->tl); spin_unlock_irq(&dev->lock); } return id; } static int misdn_del_timer(struct mISDNtimerdev *dev, int id) { struct mISDNtimer *timer; spin_lock_irq(&dev->lock); list_for_each_entry(timer, &dev->pending, list) { if (timer->id == id) { list_del_init(&timer->list); timer->id = -1; spin_unlock_irq(&dev->lock); timer_shutdown_sync(&timer->tl); kfree(timer); return id; } } spin_unlock_irq(&dev->lock); return 0; } static long mISDN_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct mISDNtimerdev *dev = 
filep->private_data; int id, tout, ret = 0; if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s(%p, %x, %lx)\n", __func__, filep, cmd, arg); mutex_lock(&mISDN_mutex); switch (cmd) { case IMADDTIMER: if (get_user(tout, (int __user *)arg)) { ret = -EFAULT; break; } id = misdn_add_timer(dev, tout); if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s add %d id %d\n", __func__, tout, id); if (id < 0) { ret = id; break; } if (put_user(id, (int __user *)arg)) ret = -EFAULT; break; case IMDELTIMER: if (get_user(id, (int __user *)arg)) { ret = -EFAULT; break; } if (*debug & DEBUG_TIMER) printk(KERN_DEBUG "%s del id %d\n", __func__, id); id = misdn_del_timer(dev, id); if (put_user(id, (int __user *)arg)) ret = -EFAULT; break; default: ret = -EINVAL; } mutex_unlock(&mISDN_mutex); return ret; } static const struct file_operations mISDN_fops = { .owner = THIS_MODULE, .read = mISDN_read, .poll = mISDN_poll, .unlocked_ioctl = mISDN_ioctl, .open = mISDN_open, .release = mISDN_close, }; static struct miscdevice mISDNtimer = { .minor = MISC_DYNAMIC_MINOR, .name = "mISDNtimer", .fops = &mISDN_fops, }; int mISDN_inittimer(u_int *deb) { int err; debug = deb; err = misc_register(&mISDNtimer); if (err) printk(KERN_WARNING "mISDN: Could not register timer device\n"); return err; } void mISDN_timer_cleanup(void) { misc_deregister(&mISDNtimer); }
6 6 6 7 7 5 6 7 5 1 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 1 1 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 
1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 
1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 
2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
*/

#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/iomap.h>
#include <linux/ktime.h>

#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
#include "util.h"
#include "aops.h"
#include "trace_gfs2.h"

/* This doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
 * keep it small.
 *
 * A metapath records one route through an inode's metadata tree: for
 * every level it holds the buffer of the block at that level and the
 * index of the pointer to follow inside that block.  Level 0 is the
 * dinode itself.
 */
struct metapath {
	/* Buffer for the block at each level; mp_bh[0] is the dinode buffer. */
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
	/* Index of the pointer to follow within the block at each level. */
	__u16 mp_list[GFS2_MAX_META_HEIGHT];
	int mp_fheight; /* find_metapath height */
	int mp_aheight; /* actual height (lookup height) */
};

/* Forward declaration; used by the iomap write paths in this file. */
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);

/**
 * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio
 * @ip: the inode
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @folio: The folio.
* * Returns: errno */ static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct folio *folio) { struct inode *inode = &ip->i_inode; if (!folio_test_uptodate(folio)) { void *kaddr = kmap_local_folio(folio, 0); u64 dsize = i_size_read(inode); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, folio_size(folio) - dsize); kunmap_local(kaddr); folio_mark_uptodate(folio); } if (gfs2_is_jdata(ip)) { struct buffer_head *bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, BIT(inode->i_blkbits), BIT(BH_Uptodate)); if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } else { folio_mark_dirty(folio); gfs2_ordered_add_inode(ip); } return 0; } static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; u64 block = 0; int isdir = gfs2_is_dir(ip); int error; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) return error; if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ unsigned int n = 1; error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) goto out_brelse; if (isdir) { gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); if (error) goto out_brelse; gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { error = gfs2_unstuffer_folio(ip, dibh, block, folio); if (error) goto out_brelse; } } /* Set up the pointer to the new block */ gfs2_trans_add_meta(ip->i_gl, dibh); di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); if (i_size_read(&ip->i_inode)) { *(__be64 *)(di + 1) = cpu_to_be64(block); gfs2_add_inode_blocks(&ip->i_inode, 1); di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); } ip->i_height = 1; 
di->di_height = cpu_to_be16(1); out_brelse: brelse(dibh); return error; } /** * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big * @ip: The GFS2 inode to unstuff * * This routine unstuffs a dinode and returns it to a "normal" state such * that the height can be grown in the traditional way. * * Returns: errno */ int gfs2_unstuff_dinode(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; struct folio *folio; int error; down_write(&ip->i_rw_mutex); folio = filemap_grab_folio(inode->i_mapping, 0); error = PTR_ERR(folio); if (IS_ERR(folio)) goto out; error = __gfs2_unstuff_inode(ip, folio); folio_unlock(folio); folio_put(folio); out: up_write(&ip->i_rw_mutex); return error; } /** * find_metapath - Find path through the metadata tree * @sdp: The superblock * @block: The disk block to look up * @mp: The metapath to return the result in * @height: The pre-calculated height of the metadata tree * * This routine returns a struct metapath structure that defines a path * through the metadata of inode "ip" to get to block "block". * * Example: * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a * filesystem with a blocksize of 4096. * * find_metapath() would return a struct metapath structure set to: * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165. * * That means that in order to get to the block containing the byte at * offset 101342453, we would load the indirect block pointed to by pointer * 0 in the dinode. We would then load the indirect block pointed to by * pointer 48 in that indirect block. We would then load the data block * pointed to by pointer 165 in that indirect block. 
* * ---------------------------------------- * | Dinode | | * | | 4| * | |0 1 2 3 4 5 9| * | | 6| * ---------------------------------------- * | * | * V * ---------------------------------------- * | Indirect Block | * | 5| * | 4 4 4 4 4 5 5 1| * |0 5 6 7 8 9 0 1 2| * ---------------------------------------- * | * | * V * ---------------------------------------- * | Indirect Block | * | 1 1 1 1 1 5| * | 6 6 6 6 6 1| * |0 3 4 5 6 7 2| * ---------------------------------------- * | * | * V * ---------------------------------------- * | Data block containing offset | * | 101342453 | * | | * | | * ---------------------------------------- * */ static void find_metapath(const struct gfs2_sbd *sdp, u64 block, struct metapath *mp, unsigned int height) { unsigned int i; mp->mp_fheight = height; for (i = height; i--;) mp->mp_list[i] = do_div(block, sdp->sd_inptrs); } static inline unsigned int metapath_branch_start(const struct metapath *mp) { if (mp->mp_list[0] == 0) return 2; return 1; } /** * metaptr1 - Return the first possible metadata pointer in a metapath buffer * @height: The metadata height (0 = dinode) * @mp: The metapath */ static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp) { struct buffer_head *bh = mp->mp_bh[height]; if (height == 0) return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode))); return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header))); } /** * metapointer - Return pointer to start of metadata in a buffer * @height: The metadata height (0 = dinode) * @mp: The metapath * * Return a pointer to the block number of the next height of the metadata * tree given a buffer containing the pointer to the current height of the * metadata tree. 
*/ static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { __be64 *p = metaptr1(height, mp); return p + mp->mp_list[height]; } static inline const __be64 *metaend(unsigned int height, const struct metapath *mp) { const struct buffer_head *bh = mp->mp_bh[height]; return (const __be64 *)(bh->b_data + bh->b_size); } static void clone_metapath(struct metapath *clone, struct metapath *mp) { unsigned int hgt; *clone = *mp; for (hgt = 0; hgt < mp->mp_aheight; hgt++) get_bh(clone->mp_bh[hgt]); } static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) { const __be64 *t; for (t = start; t < end; t++) { struct buffer_head *rabh; if (!*t) continue; rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | REQ_PRIO, rabh); continue; } unlock_buffer(rabh); } brelse(rabh); } } static inline struct buffer_head * metapath_dibh(struct metapath *mp) { return mp->mp_bh[0]; } static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, unsigned int x, unsigned int h) { for (; x < h; x++) { __be64 *ptr = metapointer(x, mp); u64 dblock = be64_to_cpu(*ptr); int ret; if (!dblock) break; ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]); if (ret) return ret; } mp->mp_aheight = x + 1; return 0; } /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode * @mp: The metapath * * Assumes that the inode's buffer has already been looked up and * hooked onto mp->mp_bh[0] and that the metapath has been initialised * by find_metapath(). * * If this function encounters part of the tree which has not been * allocated, it returns the current height of the tree at the point * at which it found the unallocated block. Blocks which are found are * added to the mp->mp_bh[] list. 
* * Returns: error */ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { return __fillup_metapath(ip, mp, 0, ip->i_height - 1); } /** * fillup_metapath - fill up buffers for the metadata path to a specific height * @ip: The inode * @mp: The metapath * @h: The height to which it should be mapped * * Similar to lookup_metapath, but does lookups for a range of heights * * Returns: error or the number of buffers filled */ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) { unsigned int x = 0; int ret; if (h) { /* find the first buffer we need to look up. */ for (x = h - 1; x > 0; x--) { if (mp->mp_bh[x]) break; } } ret = __fillup_metapath(ip, mp, x, h); if (ret) return ret; return mp->mp_aheight - x - 1; } static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp) { sector_t factor = 1, block = 0; int hgt; for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) { if (hgt < mp->mp_aheight) block += mp->mp_list[hgt] * factor; factor *= sdp->sd_inptrs; } return block; } static void release_metapath(struct metapath *mp) { int i; for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); mp->mp_bh[i] = NULL; } } /** * gfs2_extent_length - Returns length of an extent of blocks * @bh: The metadata block * @ptr: Current position in @bh * @eob: Set to 1 if we hit "end of block" * * Returns: The length of the extent (minimum of one block) */ static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob) { const __be64 *end = (__be64 *)(bh->b_data + bh->b_size); const __be64 *first = ptr; u64 d = be64_to_cpu(*ptr); *eob = 0; do { ptr++; if (ptr >= end) break; d++; } while(be64_to_cpu(*ptr) == d); if (ptr >= end) *eob = 1; return ptr - first; } enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE }; /* * gfs2_metadata_walker - walk an indirect block * @mp: Metapath to indirect block * @ptrs: Number of pointers to look at * * When returning 
WALK_FOLLOW, the walker must update @mp to point at the right * indirect block to follow. */ typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp, unsigned int ptrs); /* * gfs2_walk_metadata - walk a tree of indirect blocks * @inode: The inode * @mp: Starting point of walk * @max_len: Maximum number of blocks to walk * @walker: Called during the walk * * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or * past the end of metadata, and a negative error code otherwise. */ static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp, u64 max_len, gfs2_metadata_walker walker) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); u64 factor = 1; unsigned int hgt; int ret; /* * The walk starts in the lowest allocated indirect block, which may be * before the position indicated by @mp. Adjust @max_len accordingly * to avoid a short walk. */ for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) { max_len += mp->mp_list[hgt] * factor; mp->mp_list[hgt] = 0; factor *= sdp->sd_inptrs; } for (;;) { u16 start = mp->mp_list[hgt]; enum walker_status status; unsigned int ptrs; u64 len; /* Walk indirect block. */ ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start; len = ptrs * factor; if (len > max_len) ptrs = DIV_ROUND_UP_ULL(max_len, factor); status = walker(mp, ptrs); switch (status) { case WALK_STOP: return 1; case WALK_FOLLOW: BUG_ON(mp->mp_aheight == mp->mp_fheight); ptrs = mp->mp_list[hgt] - start; len = ptrs * factor; break; case WALK_CONTINUE: break; } if (len >= max_len) break; max_len -= len; if (status == WALK_FOLLOW) goto fill_up_metapath; lower_metapath: /* Decrease height of metapath. */ brelse(mp->mp_bh[hgt]); mp->mp_bh[hgt] = NULL; mp->mp_list[hgt] = 0; if (!hgt) break; hgt--; factor *= sdp->sd_inptrs; /* Advance in metadata tree. 
*/ (mp->mp_list[hgt])++; if (hgt) { if (mp->mp_list[hgt] >= sdp->sd_inptrs) goto lower_metapath; } else { if (mp->mp_list[hgt] >= sdp->sd_diptrs) break; } fill_up_metapath: /* Increase height of metapath. */ ret = fillup_metapath(ip, mp, ip->i_height - 1); if (ret < 0) return ret; hgt += ret; for (; ret; ret--) do_div(factor, sdp->sd_inptrs); mp->mp_aheight = hgt + 1; } return 0; } static enum walker_status gfs2_hole_walker(struct metapath *mp, unsigned int ptrs) { const __be64 *start, *ptr, *end; unsigned int hgt; hgt = mp->mp_aheight - 1; start = metapointer(hgt, mp); end = start + ptrs; for (ptr = start; ptr < end; ptr++) { if (*ptr) { mp->mp_list[hgt] += ptr - start; if (mp->mp_aheight == mp->mp_fheight) return WALK_STOP; return WALK_FOLLOW; } } return WALK_CONTINUE; } /** * gfs2_hole_size - figure out the size of a hole * @inode: The inode * @lblock: The logical starting block number * @len: How far to look (in blocks) * @mp: The metapath at lblock * @iomap: The iomap to store the hole size in * * This function modifies @mp. * * Returns: errno on error */ static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, struct metapath *mp, struct iomap *iomap) { struct metapath clone; u64 hole_size; int ret; clone_metapath(&clone, mp); ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker); if (ret < 0) goto out; if (ret == 1) hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock; else hole_size = len; iomap->length = hole_size << inode->i_blkbits; ret = 0; out: release_metapath(&clone); return ret; } static inline void gfs2_indirect_init(struct metapath *mp, struct gfs2_glock *gl, unsigned int i, unsigned offset, u64 bn) { __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + ((i > 1) ? 
sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode))); BUG_ON(i < 1); BUG_ON(mp->mp_bh[i] != NULL); mp->mp_bh[i] = gfs2_meta_new(gl, bn); gfs2_trans_add_meta(gl, mp->mp_bh[i]); gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); ptr += offset; *ptr = cpu_to_be64(bn); } enum alloc_state { ALLOC_DATA = 0, ALLOC_GROW_DEPTH = 1, ALLOC_GROW_HEIGHT = 2, /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ }; /** * __gfs2_iomap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode * @iomap: The iomap structure * @mp: The metapath, with proper height information calculated * * In this routine we may have to alloc: * i) Indirect blocks to grow the metadata tree height * ii) Indirect blocks to fill in lower part of the metadata tree * iii) Data blocks * * This function is called after __gfs2_iomap_get, which works out the * total number of blocks which we need via gfs2_alloc_size. * * We then do the actual allocation asking for an extent at a time (if * enough contiguous free blocks are available, there will only be one * allocation request per call) and uses the state machine to initialise * the blocks in order. * * Right now, this function will allocate at most one indirect block * worth of data -- with a default block size of 4K, that's slightly * less than 2M. If this limitation is ever removed to allow huge * allocations, we would probably still want to limit the iomap size we * return to avoid stalling other tasks during huge writes; the next * iomap iteration would then find the blocks already allocated. 
*
 * Returns: errno on error
 */
static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
			      struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *dibh = metapath_dibh(mp);
	u64 bn;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	size_t dblks = iomap->length >> inode->i_blkbits;
	const unsigned end_of_metadata = mp->mp_fheight - 1;
	int ret;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;

	BUG_ON(mp->mp_aheight < 1);
	BUG_ON(dibh == NULL);
	BUG_ON(dblks < 1);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	down_write(&ip->i_rw_mutex);

	/* First part: pick the starting state and count indirect blocks. */
	if (mp->mp_fheight == mp->mp_aheight) {
		/* Bottom indirect block exists */
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		if (mp->mp_fheight == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = mp->mp_fheight - mp->mp_aheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = mp->mp_fheight - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (mp->mp_fheight - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = mp->mp_aheight;
	do {
		/* Ask for everything still missing; n returns the extent size. */
		n = blks - alloced;
		ret = gfs2_alloc_blocks(ip, &bn, &n, 0);
		if (ret)
			goto out;
		alloced += n;
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_remove_revoke(sdp, bn, n);
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				/* Remember the dinode's first pointer. */
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < mp->mp_fheight - ip->i_height &&
			     n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == mp->mp_fheight - ip->i_height) {
				i--;
				/* Move the dinode's pointers into the new
				 * block, leaving one pointer in the dinode. */
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				/* Drop buffers from branch_start upwards. */
				for(i = branch_start; i < mp->mp_fheight; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
			fallthrough;	/* To branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < mp->mp_fheight)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < mp->mp_fheight && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == mp->mp_fheight)
				state = ALLOC_DATA;
			if (n == 0)
				break;
			fallthrough;	/* To tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			/* Setting iomap->addr is what ends the do/while loop. */
			iomap->addr = bn << inode->i_blkbits;
			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			break;
		}
	} while (iomap->addr == IOMAP_NULL_ADDR);

	iomap->type = IOMAP_MAPPED;
	iomap->length = (u64)dblks << inode->i_blkbits;
	ip->i_height = mp->mp_fheight;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, dibh->b_data);
out:
	up_write(&ip->i_rw_mutex);
	return ret;
}

/* Set by __gfs2_iomap_get() when the extent ran to the end of its block. */
#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE

/**
 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
 * @size: Requested size in blocks
 *
 * Compute the maximum size of the next allocation at @mp.
 *
 * Returns: size in blocks
 */
static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	const __be64 *first, *ptr, *end;

	/*
	 * For writes to stuffed files, this function is called twice via
	 * __gfs2_iomap_get, before and after unstuffing. The size we return the
	 * first time needs to be large enough to get the reservation and
	 * allocation sizes right.  The size we return the second time must
	 * be exact or else __gfs2_iomap_alloc won't do the right thing.
	 */

	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
		/* Cap at the pointers left in the bottom-level block. */
		unsigned int maxsize = mp->mp_fheight > 1 ?
			sdp->sd_inptrs : sdp->sd_diptrs;
		maxsize -= mp->mp_list[mp->mp_fheight - 1];
		if (size > maxsize)
			size = maxsize;
		return size;
	}

	/* Bottom block exists: count the zero pointers starting at @mp. */
	first = metapointer(ip->i_height - 1, mp);
	end = metaend(ip->i_height - 1, mp);
	if (end - first > size)
		end = first + size;
	for (ptr = first; ptr < end; ptr++) {
		if (*ptr)
			break;
	}
	return ptr - first;
}

/**
 * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 */
static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
			    unsigned flags, struct iomap *iomap,
			    struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	loff_t size = i_size_read(inode);
	__be64 *ptr;
	sector_t lblock;
	sector_t lblock_stop;
	int ret;
	int eob;
	u64 len;
	struct buffer_head *dibh = NULL, *bh;
	u8 height;

	if (!length)
		return -EINVAL;

	down_read(&ip->i_rw_mutex);

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		goto unlock;
	/* The dinode buffer is stored in @mp, not released here. */
	mp->mp_bh[0] = dibh;

	if (gfs2_is_stuffed(ip)) {
		if (flags & IOMAP_WRITE) {
			loff_t max_size = gfs2_max_stuffed_size(ip);

			/* Write does not fit inline: map it as unstuffed. */
			if (pos + length > max_size)
				goto unstuff;
			iomap->length = max_size;
		} else {
			if (pos >= size) {
				if (flags & IOMAP_REPORT) {
					ret = -ENOENT;
					goto unlock;
				} else {
					iomap->offset = pos;
					iomap->length = length;
					goto hole_found;
				}
			}
			iomap->length = size;
		}
		/* Inline data sits in the dinode block after the header. */
		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
			      sizeof(struct gfs2_dinode);
		iomap->type = IOMAP_INLINE;
		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
		goto out;
	}

unstuff:
	/* Round the byte range out to whole blocks. */
	lblock = pos >> inode->i_blkbits;
	iomap->offset = lblock << inode->i_blkbits;
	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
	len = lblock_stop - lblock + 1;
	iomap->length = len << inode->i_blkbits;

	/* Raise height until sd_heightsize[height] covers block lblock. */
	height = ip->i_height;
	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
		height++;
	find_metapath(sdp, lblock, mp, height);
	if (height > ip->i_height || gfs2_is_stuffed(ip))
		goto do_alloc;

	ret = lookup_metapath(ip, mp);
	if (ret)
		goto unlock;

	/* Lookup stopped short of the bottom level: nothing allocated here. */
	if (mp->mp_aheight != ip->i_height)
		goto do_alloc;

	ptr = metapointer(ip->i_height - 1, mp);
	if (*ptr == 0)
		goto do_alloc;

	bh = mp->mp_bh[ip->i_height - 1];
	len = gfs2_extent_length(bh, ptr, &eob);

	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
	iomap->length = len << inode->i_blkbits;
	iomap->type = IOMAP_MAPPED;
	iomap->flags |= IOMAP_F_MERGED;
	if (eob)
		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
	iomap->bdev = inode->i_sb->s_bdev;
unlock:
	up_read(&ip->i_rw_mutex);
	return ret;

do_alloc:
	/* Nothing is allocated here; the range is reported as a hole. */
	if (flags & IOMAP_REPORT) {
		if (pos >= size)
			ret = -ENOENT;
		else if (height == ip->i_height)
			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
		else
			iomap->length = size - iomap->offset;
	} else if (flags & IOMAP_WRITE) {
		u64 alloc_size;

		if (flags & IOMAP_DIRECT)
			goto out;  /* (see gfs2_file_direct_write) */

		len = gfs2_alloc_size(inode, mp, len);
		alloc_size = len << inode->i_blkbits;
		if (alloc_size < iomap->length)
			iomap->length = alloc_size;
	} else {
		if (pos < size && height == ip->i_height)
			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
	}
hole_found:
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	goto out;
}

/*
 * gfs2_iomap_get_folio - ->get_folio hook: open a transaction, get the folio
 * @iter: The iomap iterator
 * @pos: Starting position in bytes
 * @len: Length in bytes
 *
 * Begins a transaction reserving RES_DINODE plus the number of blocks
 * spanned by @pos and @len, then obtains the folio via iomap_get_folio().
 * The transaction is ended again if no folio could be obtained.
 *
 * Returns: the folio or an ERR_PTR
 */
static struct folio *
gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
{
	struct inode *inode = iter->inode;
	unsigned int blockmask = i_blocksize(inode) - 1;
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int blocks;
	struct folio *folio;
	int status;

	/* Count the blocks the range touches, including partial ones. */
	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
	status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
	if (status)
		return ERR_PTR(status);

	folio = iomap_get_folio(iter, pos, len);
	if (IS_ERR(folio))
		gfs2_trans_end(sdp);
	return folio;
}

/*
 * gfs2_iomap_put_folio - ->put_folio hook: release the folio, end transaction
 * @inode: The inode
 * @pos: Starting position in bytes
 * @copied: Number of bytes copied into the folio
 * @folio: The folio
 *
 * For unstuffed inodes the written range is added to the transaction
 * first.  The inode is marked I_DIRTY_DATASYNC when the transaction has a
 * non-zero tr_num_buf_new.
 */
static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
				 unsigned copied, struct folio *folio)
{
	struct gfs2_trans *tr = current->journal_info;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	if (!gfs2_is_stuffed(ip))
		gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
					copied);

	folio_unlock(folio);
	folio_put(folio);

	if (tr->tr_num_buf_new)
		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

	gfs2_trans_end(sdp);
}

/* Folio hooks; gfs2_iomap_begin_write() installs them for stuffed and
 * jdata inodes. */
static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
	.get_folio = gfs2_iomap_get_folio,
	.put_folio = gfs2_iomap_put_folio,
};

/*
 * gfs2_iomap_begin_write - prepare @iomap for a buffered write or zeroing
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length in bytes
 * @flags: iomap flags
 * @iomap: The mapping obtained from __gfs2_iomap_get()
 * @mp: The metapath belonging to @iomap
 *
 * When the inode has to be unstuffed or the mapping is a hole, this takes
 * the quota lock, reserves blocks, opens a transaction, unstuffs and/or
 * allocates, and then ends the transaction.  On success the quota lock and
 * the reservation stay held; gfs2_iomap_end() is the visible counterpart
 * that releases them.
 *
 * Returns: errno
 */
static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
				  loff_t length, unsigned flags,
				  struct iomap *iomap,
				  struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	bool unstuff;
	int ret;

	unstuff = gfs2_is_stuffed(ip) &&
		  pos + length > gfs2_max_stuffed_size(ip);

	if (unstuff || iomap->type == IOMAP_HOLE) {
		unsigned int data_blocks, ind_blocks;
		struct gfs2_alloc_parms ap = {};
		unsigned int rblocks;
		struct gfs2_trans *tr;

		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
				       &ind_blocks);
		ap.target = data_blocks + ind_blocks;
		ret = gfs2_quota_lock_check(ip, &ap);
		if (ret)
			return ret;

		ret = gfs2_inplace_reserve(ip, &ap);
		if (ret)
			goto out_qunlock;

		/* Work out how many blocks the transaction must reserve. */
		rblocks = RES_DINODE + ind_blocks;
		if (gfs2_is_jdata(ip))
			rblocks += data_blocks;
		if (ind_blocks || data_blocks)
			rblocks += RES_STATFS + RES_QUOTA;
		if (inode == sdp->sd_rindex)
			rblocks += 2 * RES_STATFS;
		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

		ret = gfs2_trans_begin(sdp, rblocks,
				       iomap->length >> inode->i_blkbits);
		if (ret)
			goto out_trans_fail;

		if (unstuff) {
			ret = gfs2_unstuff_dinode(ip);
			if (ret)
				goto out_trans_end;
			/* Redo the mapping now that the inode is unstuffed. */
			release_metapath(mp);
			ret = __gfs2_iomap_get(inode, iomap->offset,
					       iomap->length, flags, iomap, mp);
			if (ret)
				goto out_trans_end;
		}

		if (iomap->type == IOMAP_HOLE) {
			ret = __gfs2_iomap_alloc(inode, iomap, mp);
			if (ret) {
				/* Ends the transaction and releases the
				 * reservation before punch_hole() runs. */
				gfs2_trans_end(sdp);
				gfs2_inplace_release(ip);
				punch_hole(ip, iomap->offset, iomap->length);
				goto out_qunlock;
			}
		}

		tr = current->journal_info;
		if (tr->tr_num_buf_new)
			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

		gfs2_trans_end(sdp);
	}

	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
		iomap->folio_ops = &gfs2_iomap_folio_ops;
	return 0;

out_trans_end:
	gfs2_trans_end(sdp);
out_trans_fail:
	gfs2_inplace_release(ip);
out_qunlock:
	gfs2_quota_unlock(ip);
	return ret;
}

/*
 * gfs2_iomap_begin - ->iomap_begin implementation
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length in bytes
 * @flags: iomap flags
 * @iomap: The mapping to fill in
 * @srcmap: Not used by this function
 *
 * Looks up the mapping with __gfs2_iomap_get().  Buffered writes, and
 * zeroing of anything that is not a hole, additionally go through
 * gfs2_iomap_begin_write().  Direct writes to anything that is not
 * IOMAP_MAPPED fail with -ENOTBLK.
 *
 * Returns: errno
 */
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			    unsigned flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct metapath mp = { .mp_aheight = 1, };
	int ret;

	if (gfs2_is_jdata(ip))
		iomap->flags |= IOMAP_F_BUFFER_HEAD;

	trace_gfs2_iomap_start(ip, pos, length, flags);
	ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
	if (ret)
		goto out_unlock;

	switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
	case IOMAP_WRITE:
		if (flags & IOMAP_DIRECT) {
			/*
			 * Silently fall back to buffered I/O for stuffed files
			 * or if we've got a hole (see gfs2_file_direct_write).
			 */
			if (iomap->type != IOMAP_MAPPED)
				ret = -ENOTBLK;
			goto out_unlock;
		}
		break;
	case IOMAP_ZERO:
		if (iomap->type == IOMAP_HOLE)
			goto out_unlock;
		break;
	default:
		goto out_unlock;
	}

	ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);

out_unlock:
	/* The metapath buffers are always dropped before returning. */
	release_metapath(&mp);
	trace_gfs2_iomap_end(ip, iomap, ret);
	return ret;
}

/*
 * gfs2_iomap_end - ->iomap_end implementation
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length that was mapped, in bytes
 * @written: Number of bytes actually written
 * @flags: iomap flags
 * @iomap: The mapping
 *
 * Only acts on buffered writes and on zeroing of non-holes.  Releases the
 * block reservation and, when quota data is attached to the inode, the
 * quota lock.  If the write was short, newly allocated blocks beyond the
 * written range are punched out again.
 *
 * Returns: 0
 */
static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	/* Mirrors the switch in gfs2_iomap_begin(). */
	switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
	case IOMAP_WRITE:
		if (flags & IOMAP_DIRECT)
			return 0;
		break;
	case IOMAP_ZERO:
		 if (iomap->type == IOMAP_HOLE)
			 return 0;
		 break;
	default:
		 return 0;
	}

	if (!gfs2_is_stuffed(ip))
		gfs2_ordered_add_inode(ip);

	if (inode == sdp->sd_rindex)
		adjust_fs_space(inode);

	gfs2_inplace_release(ip);

	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
		gfs2_quota_unlock(ip);

	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
		/* Deallocate blocks that were just allocated. */
		loff_t hstart = round_up(pos + written, i_blocksize(inode));
		loff_t hend = iomap->offset + iomap->length;

		if (hstart < hend) {
			truncate_pagecache_range(inode, hstart, hend - 1);
			punch_hole(ip, hstart, hend - hstart);
		}
	}

	if (unlikely(!written))
		return 0;

	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
		mark_inode_dirty(inode);
	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
	return 0;
}

/* iomap operations exported by this file. */
const struct iomap_ops gfs2_iomap_ops = {
	.iomap_begin = gfs2_iomap_begin,
	.iomap_end = gfs2_iomap_end,
};

/**
 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if its ok to alloc blocks to satify the request
 *
 * The size of the requested mapping is defined in bh_map->b_size.
 *
 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
 * when @lblock is not mapped. Sets buffer_mapped(bh_map) and
 * bh_map->b_size to indicate the size of the mapping when @lblock and
 * successive blocks are mapped, up to the requested size.
 *
 * Sets buffer_boundary() if a read of metadata will be required
 * before the next block can be mapped. Sets buffer_new() if new
 * blocks were allocated.
* * Returns: errno */ int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh_map, int create) { struct gfs2_inode *ip = GFS2_I(inode); loff_t pos = (loff_t)lblock << inode->i_blkbits; loff_t length = bh_map->b_size; struct iomap iomap = { }; int ret; clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); clear_buffer_boundary(bh_map); trace_gfs2_bmap(ip, bh_map, lblock, create, 1); if (!create) ret = gfs2_iomap_get(inode, pos, length, &iomap); else ret = gfs2_iomap_alloc(inode, pos, length, &iomap); if (ret) goto out; if (iomap.length > bh_map->b_size) { iomap.length = bh_map->b_size; iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY; } if (iomap.addr != IOMAP_NULL_ADDR) map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); bh_map->b_size = iomap.length; if (iomap.flags & IOMAP_F_GFS2_BOUNDARY) set_buffer_boundary(bh_map); if (iomap.flags & IOMAP_F_NEW) set_buffer_new(bh_map); out: trace_gfs2_bmap(ip, bh_map, lblock, create, ret); return ret; } int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, unsigned int *extlen) { unsigned int blkbits = inode->i_blkbits; struct iomap iomap = { }; unsigned int len; int ret; ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits, &iomap); if (ret) return ret; if (iomap.type != IOMAP_MAPPED) return -EIO; *dblock = iomap.addr >> blkbits; len = iomap.length >> blkbits; if (len < *extlen) *extlen = len; return 0; } int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, unsigned int *extlen, bool *new) { unsigned int blkbits = inode->i_blkbits; struct iomap iomap = { }; unsigned int len; int ret; ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits, &iomap); if (ret) return ret; if (iomap.type != IOMAP_MAPPED) return -EIO; *dblock = iomap.addr >> blkbits; len = iomap.length >> blkbits; if (len < *extlen) *extlen = len; *new = iomap.flags & IOMAP_F_NEW; return 0; } /* * NOTE: Never call gfs2_block_zero_range with an open transaction because it * uses iomap write 
to perform its actions, which begin their own transactions * (iomap_begin, get_folio, etc.) */ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); } #define GFS2_JTRUNC_REVOKES 8192 /** * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files * @inode: The inode being truncated * @oldsize: The original (larger) size * @newsize: The new smaller size * * With jdata files, we have to journal a revoke for each block which is * truncated. As a result, we need to split this into separate transactions * if the number of pages being truncated gets too large. */ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) { struct gfs2_sbd *sdp = GFS2_SB(inode); u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; u64 chunk; int error; while (oldsize != newsize) { struct gfs2_trans *tr; unsigned int offs; chunk = oldsize - newsize; if (chunk > max_chunk) chunk = max_chunk; offs = oldsize & ~PAGE_MASK; if (offs && chunk > PAGE_SIZE) chunk = offs + ((chunk - offs) & PAGE_MASK); truncate_pagecache(inode, oldsize - chunk); oldsize -= chunk; tr = current->journal_info; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) continue; gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); if (error) return error; } return 0; } static int trunc_start(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh = NULL; int journaled = gfs2_is_jdata(ip); u64 oldsize = inode->i_size; int error; if (!gfs2_is_stuffed(ip)) { unsigned int blocksize = i_blocksize(inode); unsigned int offs = newsize & (blocksize - 1); if (offs) { error = gfs2_block_zero_range(inode, newsize, blocksize - offs); if (error) return error; } } if (journaled) error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); 
else error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); else ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; i_size_write(inode, newsize); inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_dinode_out(ip, dibh->b_data); if (journaled) error = gfs2_journaled_truncate(inode, oldsize, newsize); else truncate_pagecache(inode, newsize); out: brelse(dibh); if (current->journal_info) gfs2_trans_end(sdp); return error; } int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, struct iomap *iomap) { struct metapath mp = { .mp_aheight = 1, }; int ret; ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp); release_metapath(&mp); return ret; } int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, struct iomap *iomap) { struct metapath mp = { .mp_aheight = 1, }; int ret; ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); if (!ret && iomap->type == IOMAP_HOLE) ret = __gfs2_iomap_alloc(inode, iomap, &mp); release_metapath(&mp); return ret; } /** * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein * @ip: inode * @rd_gh: holder of resource group glock * @bh: buffer head to sweep * @start: starting point in bh * @end: end point in bh * @meta: true if bh points to metadata (rather than data) * @btotal: place to keep count of total blocks freed * * We sweep a metadata buffer (provided by the metapath) for blocks we need to * free, and free them all. However, we do it one rgrp at a time. If this * block has references to multiple rgrps, we break it into individual * transactions. This allows other processes to use the rgrps while we're * focused on a single one, for better concurrency / performance. * At every transaction boundary, we rewrite the inode into the journal. 
* That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      struct buffer_head *bh, __be64 *start, __be64 *end,
			      bool meta, u32 *btotal)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	__be64 *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	int ret = 0;
	bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
	/* Each pass handles one rgrp: the one already held, or the first found. */
	rgd = NULL;
	if (gfs2_holder_initialized(rd_gh)) {
		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
		gfs2_assert_withdraw(sdp,
			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
	}
	blks_outside_rgrp = 0;
	bstart = 0;
	blen = 0;

	for (p = start; p < end; p++) {
		if (!*p)
			continue;
		bn = be64_to_cpu(*p);
		if (rgd) {
			/* Pointers into other rgrps are left for a later pass. */
			if (!rgrp_contains_block(rgd, bn)) {
				blks_outside_rgrp++;
				continue;
			}
		} else {
			rgd = gfs2_blk2rgrpd(sdp, bn, true);
			if (unlikely(!rgd)) {
				ret = -EIO;
				goto out;
			}
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
						 LM_FLAG_NODE_SCOPE, rd_gh);
			if (ret)
				goto out;

			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rgd)
				gfs2_rs_deltree(&ip->i_res);
		}

		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
				RES_INDIRECT;
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				jblocks_rqsted +=
					atomic_read(&sdp->sd_log_thresh2);
			else
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
			if (meta)
				revokes += end - start;
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			if (ret)
				goto out_unlock;
			/* i_rw_mutex is held for as long as the transaction is open. */
			down_write(&ip->i_rw_mutex);
		}
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			   transaction. */
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */

			if (buf_in_tr)
				break;
			goto out_unlock;
		}

		gfs2_trans_add_meta(ip->i_gl, bh);
		buf_in_tr = true;
		*p = 0;
		/* Extend the current run while the block numbers are contiguous. */
		if (bstart + blen == bn) {
			blen++;
			continue;
		}
		/* The run ended: free it and start a new one at bn. */
		if (bstart) {
			__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
			(*btotal) += blen;
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		}
		bstart = bn;
		blen = 1;
	}
	/* Free the final run collected by the loop, if any. */
	if (bstart) {
		__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
		(*btotal) += blen;
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	}
out_unlock:
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh;

			ret = gfs2_meta_inode_buffer(ip, &dibh);
			if (ret)
				goto out;

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode));
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			brelse(dibh);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
			buf_in_tr = false;
		}
		gfs2_glock_dq_uninit(rd_gh);
		cond_resched();
		goto more_rgrps;
	}
out:
	return ret;
}

/*
 * mp_eq_to_hgt - compare the leading entries of a metapath with a list
 *
 * Returns true when the first @h entries of mp->mp_list are bytewise equal
 * to the first @h entries of @list (trivially true for @h == 0).
 */
static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
{
	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
		return false;
	return true;
}

/**
 * find_nonnull_ptr - find a non-null pointer given a metapath and height
 * @sdp: The superblock
 * @mp: starting metapath
 * @h: desired height to search
 * @end_list: See punch_hole().
 * @end_aligned: See punch_hole().
 *
 * Assumes the metapath is valid (with buffers) out to height h.
 * Returns: true if a non-null pointer was found in the metapath buffer
 *          false if all remaining pointers are NULL in the buffer
 */
static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
			     unsigned int h,
			     __u16 *end_list, unsigned int end_aligned)
{
	struct buffer_head *bh = mp->mp_bh[h];
	__be64 *first, *ptr, *end;

	first = metaptr1(h, mp);
	ptr = first + mp->mp_list[h];
	end = (__be64 *)(bh->b_data + bh->b_size);
	/* On the end path, stop at the end position instead of the buffer end. */
	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
		bool keep_end = h < end_aligned;
		end = first + end_list[h] + keep_end;
	}

	while (ptr < end) {
		if (*ptr) { /* if we have a non-null pointer */
			mp->mp_list[h] = ptr - first;
			h++;
			/* Restart the next height down at its first pointer. */
			if (h < GFS2_MAX_META_HEIGHT)
				mp->mp_list[h] = 0;
			return true;
		}
		ptr++;
	}
	return false;
}

/* States of the deallocation loop in punch_hole(). */
enum dealloc_states {
	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,       /* process complete */
};

/*
 * metapointer_range - compute the pointer range to process in one buffer
 *
 * Sets *start and *end to the range of block pointers in mp->mp_bh[height].
 * By default that is everything from the first pointer to the end of the
 * buffer.  When the metapath matches @start_list (or @end_list, if given)
 * down to @height, the respective bound is moved to that list's index at
 * @height, plus one if @height is below the corresponding *_aligned value.
 */
static inline void
metapointer_range(struct metapath *mp, int height,
		  __u16 *start_list, unsigned int start_aligned,
		  __u16 *end_list, unsigned int end_aligned,
		  __be64 **start, __be64 **end)
{
	struct buffer_head *bh = mp->mp_bh[height];
	__be64 *first;

	first = metaptr1(height, mp);
	*start = first;
	if (mp_eq_to_hgt(mp, start_list, height)) {
		bool keep_start = height < start_aligned;
		*start = first + start_list[height] + keep_start;
	}
	*end = (__be64 *)(bh->b_data + bh->b_size);
	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
		bool keep_end = height < end_aligned;
		*end = first + end_list[height] + keep_end;
	}
}

/*
 * walk_done - has the walk at @height reached its end?
 *
 * With an @end_list, the walk is only done once the metapath matches the end
 * path down to @height and the index has reached the end index.  Without
 * one, the bound is the number of pointers per block: sd_diptrs at height 0,
 * sd_inptrs otherwise.
 */
static inline bool walk_done(struct gfs2_sbd *sdp,
			     struct metapath *mp, int height,
			     __u16 *end_list, unsigned int end_aligned)
{
	__u16 end;

	if (end_list) {
		bool keep_end = height < end_aligned;
		if (!mp_eq_to_hgt(mp, end_list, height))
			return false;
		end = end_list[height] + keep_end;
	} else
		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
	return mp->mp_list[height] >= end;
}

/**
 * punch_hole - deallocate blocks in a file
 * @ip: inode to truncate
 * @offset: the start of the hole
 * @length: the size of the hole (or 0 for truncate)
 *
 * Punch a hole into a file or truncate a file at a given position.  This
 * function operates in whole blocks (@offset and @length are rounded
 * accordingly); partially filled blocks must be cleared otherwise.
 *
 * This function works from the bottom up, and from the right to the left. In
 * other words, it strips off the highest layer (data) before stripping any of
 * the metadata. Doing it this way is best in case the operation is interrupted
 * by power failure, etc.  The dinode is rewritten in every transaction to
 * guarantee integrity.
*/ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u64 maxsize = sdp->sd_heightsize[ip->i_height]; struct metapath mp = {}; struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int bsize = 1 << bsize_shift; u64 lblock = (offset + bsize - 1) >> bsize_shift; __u16 start_list[GFS2_MAX_META_HEIGHT]; __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; unsigned int start_aligned, end_aligned; unsigned int strip_h = ip->i_height - 1; u32 btotal = 0; int ret, state; int mp_h; /* metapath buffers are read in to this height */ u64 prev_bnr = 0; __be64 *start, *end; if (offset + bsize - 1 >= maxsize) { /* * The starting point lies beyond the allocated metadata; * there are no blocks to deallocate. */ return 0; } /* * The start position of the hole is defined by lblock, start_list, and * start_aligned. The end position of the hole is defined by lend, * end_list, and end_aligned. * * start_aligned and end_aligned define down to which height the start * and end positions are aligned to the metadata tree (i.e., the * position is a multiple of the metadata granularity at the height * above). This determines at which heights additional meta pointers * needs to be preserved for the remaining data. */ if (length) { u64 end_offset = offset + length; u64 lend; /* * Clip the end at the maximum file size for the given height: * that's how far the metadata goes; files bigger than that * will have additional layers of indirection. 
*/ if (end_offset > maxsize) end_offset = maxsize; lend = end_offset >> bsize_shift; if (lblock >= lend) return 0; find_metapath(sdp, lend, &mp, ip->i_height); end_list = __end_list; memcpy(end_list, mp.mp_list, sizeof(mp.mp_list)); for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { if (end_list[mp_h]) break; } end_aligned = mp_h; } find_metapath(sdp, lblock, &mp, ip->i_height); memcpy(start_list, mp.mp_list, sizeof(start_list)); for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { if (start_list[mp_h]) break; } start_aligned = mp_h; ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) return ret; mp.mp_bh[0] = dibh; ret = lookup_metapath(ip, &mp); if (ret) goto out_metapath; /* issue read-ahead on metadata */ for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) { metapointer_range(&mp, mp_h, start_list, start_aligned, end_list, end_aligned, &start, &end); gfs2_metapath_ra(ip->i_gl, start, end); } if (mp.mp_aheight == ip->i_height) state = DEALLOC_MP_FULL; /* We have a complete metapath */ else state = DEALLOC_FILL_MP; /* deal with partial metapath */ ret = gfs2_rindex_update(sdp); if (ret) goto out_metapath; ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) goto out_metapath; gfs2_holder_mark_uninitialized(&rd_gh); mp_h = strip_h; while (state != DEALLOC_DONE) { switch (state) { /* Truncate a full metapath at the given strip height. * Note that strip_h == mp_h in order to be in this state. */ case DEALLOC_MP_FULL: bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, prev_bnr != bh->b_blocknr)) { fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, " "s_h:%u, mp_h:%u\n", (unsigned long long)ip->i_no_addr, prev_bnr, ip->i_height, strip_h, mp_h); } prev_bnr = bh->b_blocknr; if (gfs2_metatype_check(sdp, bh, (mp_h ? 
GFS2_METATYPE_IN : GFS2_METATYPE_DI))) { ret = -EIO; goto out; } /* * Below, passing end_aligned as 0 gives us the * metapointer range excluding the end point: the end * point is the first metapath we must not deallocate! */ metapointer_range(&mp, mp_h, start_list, start_aligned, end_list, 0 /* end_aligned */, &start, &end); ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h], start, end, mp_h != ip->i_height - 1, &btotal); /* If we hit an error or just swept dinode buffer, just exit. */ if (ret || !mp_h) { state = DEALLOC_DONE; break; } state = DEALLOC_MP_LOWER; break; /* lower the metapath strip height */ case DEALLOC_MP_LOWER: /* We're done with the current buffer, so release it, unless it's the dinode buffer. Then back up to the previous pointer. */ if (mp_h) { brelse(mp.mp_bh[mp_h]); mp.mp_bh[mp_h] = NULL; } /* If we can't get any lower in height, we've stripped off all we can. Next step is to back up and start stripping the previous level of metadata. */ if (mp_h == 0) { strip_h--; memcpy(mp.mp_list, start_list, sizeof(start_list)); mp_h = strip_h; state = DEALLOC_FILL_MP; break; } mp.mp_list[mp_h] = 0; mp_h--; /* search one metadata height down */ mp.mp_list[mp_h]++; if (walk_done(sdp, &mp, mp_h, end_list, end_aligned)) break; /* Here we've found a part of the metapath that is not * allocated. We need to search at that height for the * next non-null pointer. */ if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) { state = DEALLOC_FILL_MP; mp_h++; } /* No more non-null pointers at this height. Back up to the previous height and try again. */ break; /* loop around in the same state */ /* Fill the metapath with buffers to the given height. */ case DEALLOC_FILL_MP: /* Fill the buffers out to the current height. */ ret = fillup_metapath(ip, &mp, mp_h); if (ret < 0) goto out; /* On the first pass, issue read-ahead on metadata. 
*/ if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) { unsigned int height = mp.mp_aheight - 1; /* No read-ahead for data blocks. */ if (mp.mp_aheight - 1 == strip_h) height--; for (; height >= mp.mp_aheight - ret; height--) { metapointer_range(&mp, height, start_list, start_aligned, end_list, end_aligned, &start, &end); gfs2_metapath_ra(ip->i_gl, start, end); } } /* If buffers found for the entire strip height */ if (mp.mp_aheight - 1 == strip_h) { state = DEALLOC_MP_FULL; break; } if (mp.mp_aheight < ip->i_height) /* We have a partial height */ mp_h = mp.mp_aheight - 1; /* If we find a non-null block pointer, crawl a bit higher up in the metapath and try again, otherwise we need to look lower for a new starting point. */ if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) mp_h++; else state = DEALLOC_MP_LOWER; break; } } if (btotal) { if (current->journal_info == NULL) { ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_QUOTA, 0); if (ret) goto out; down_write(&ip->i_rw_mutex); } gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); } out: if (gfs2_holder_initialized(&rd_gh)) gfs2_glock_dq_uninit(&rd_gh); if (current->journal_info) { up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); cond_resched(); } gfs2_quota_unhold(ip); out_metapath: release_metapath(&mp); return ret; } static int trunc_end(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; int error; error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; down_write(&ip->i_rw_mutex); error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; if (!i_size_read(&ip->i_inode)) { ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct 
gfs2_dinode)); gfs2_ordered_del_inode(ip); } inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); out: up_write(&ip->i_rw_mutex); gfs2_trans_end(sdp); return error; } /** * do_shrink - make a file smaller * @inode: the inode * @newsize: the size to make the file * * Called with an exclusive lock on @inode. The @size must * be equal to or smaller than the current inode size. * * Returns: errno */ static int do_shrink(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); int error; error = trunc_start(inode, newsize); if (error < 0) return error; if (gfs2_is_stuffed(ip)) return 0; error = punch_hole(ip, newsize, 0); if (error == 0) error = trunc_end(ip); return error; } /** * do_grow - Touch and update inode size * @inode: The inode * @size: The new size * * This function updates the timestamps on the inode and * may also increase the size of the inode. This function * must not be called with @size any smaller than the current * inode size. * * Although it is not strictly required to unstuff files here, * earlier versions of GFS2 have a bug in the stuffed file reading * code which will result in a buffer overrun if the size is larger * than the max stuffed file size. In order to prevent this from * occurring, such files are unstuffed, but in other cases we can * just update the inode size directly. 
* * Returns: 0 on success, or -ve on error */ static int do_grow(struct inode *inode, u64 size) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .target = 1, }; struct buffer_head *dibh; int error; int unstuff = 0; if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) { error = gfs2_quota_lock_check(ip, &ap); if (error) return error; error = gfs2_inplace_reserve(ip, &ap); if (error) goto do_grow_qunlock; unstuff = 1; } error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + (unstuff && gfs2_is_jdata(ip) ? RES_JDATA : 0) + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? 0 : RES_QUOTA), 0); if (error) goto do_grow_release; if (unstuff) { error = gfs2_unstuff_dinode(ip); if (error) goto do_end_trans; } error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto do_end_trans; truncate_setsize(inode, size); inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); do_end_trans: gfs2_trans_end(sdp); do_grow_release: if (unstuff) { gfs2_inplace_release(ip); do_grow_qunlock: gfs2_quota_unlock(ip); } return error; } /** * gfs2_setattr_size - make a file a given size * @inode: the inode * @newsize: the size to make the file * * The file size can grow, shrink, or stay the same size. This * is called holding i_rwsem and an exclusive glock on the inode * in question. 
* * Returns: errno */ int gfs2_setattr_size(struct inode *inode, u64 newsize) { struct gfs2_inode *ip = GFS2_I(inode); int ret; BUG_ON(!S_ISREG(inode->i_mode)); ret = inode_newsize_ok(inode, newsize); if (ret) return ret; inode_dio_wait(inode); ret = gfs2_qa_get(ip); if (ret) goto out; if (newsize >= inode->i_size) { ret = do_grow(inode, newsize); goto out; } ret = do_shrink(inode, newsize); out: gfs2_rs_delete(ip); gfs2_qa_put(ip); return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; error = punch_hole(ip, i_size_read(&ip->i_inode), 0); if (!error) error = trunc_end(ip); return error; } int gfs2_file_dealloc(struct gfs2_inode *ip) { return punch_hole(ip, 0, 0); } /** * gfs2_free_journal_extents - Free cached journal bmap info * @jd: The journal * */ void gfs2_free_journal_extents(struct gfs2_jdesc *jd) { struct gfs2_journal_extent *jext; while(!list_empty(&jd->extent_list)) { jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list); list_del(&jext->list); kfree(jext); } } /** * gfs2_add_jextent - Add or merge a new extent to extent cache * @jd: The journal descriptor * @lblock: The logical block at start of new extent * @dblock: The physical block at start of new extent * @blocks: Size of extent in fs blocks * * Returns: 0 on success or -ENOMEM */ static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) { struct gfs2_journal_extent *jext; if (!list_empty(&jd->extent_list)) { jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list); if ((jext->dblock + jext->blocks) == dblock) { jext->blocks += blocks; return 0; } } jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); if (jext == NULL) return -ENOMEM; jext->dblock = dblock; jext->lblock = lblock; jext->blocks = blocks; list_add_tail(&jext->list, &jd->extent_list); jd->nr_extents++; return 0; } /** * gfs2_map_journal_extents - Cache journal bmap info * @sdp: The super block * @jd: The journal to map * * Create a 
reusable "extent" mapping from all logical * blocks to all physical blocks for the given journal. This will save * us time when writing journal blocks. Most journals will have only one * extent that maps all their logical blocks. That's because gfs2.mkfs * arranges the journal blocks sequentially to maximize performance. * So the extent would map the first block for the entire file length. * However, gfs2_jadd can happen while file activity is happening, so * those journals may not be sequential. Less likely is the case where * the users created their own journals by mounting the metafs and * laying it out. But it's still possible. These journals might have * several extents. * * Returns: 0 on success, or error on failure */ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) { u64 lblock = 0; u64 lblock_stop; struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct buffer_head bh; unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; ktime_t start, end; start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; WARN_ON(!list_empty(&jd->extent_list)); do { bh.b_state = 0; bh.b_blocknr = 0; bh.b_size = size; rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); if (rc || !buffer_mapped(&bh)) goto fail; rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); if (rc) goto fail; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); end = ktime_get(); fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", rc, jd->jd_jid, (unsigned long long)(i_size_read(jd->jd_inode) - size), jd->nr_extents); fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, bh.b_state, (unsigned long long)bh.b_size); 
gfs2_free_journal_extents(jd); return rc; } /** * gfs2_write_alloc_required - figure out if a write will require an allocation * @ip: the file being written to * @offset: the offset to write to * @len: the number of bytes being written * * Returns: 1 if an alloc is required, 0 otherwise */ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head bh; unsigned int shift; u64 lblock, lblock_stop, size; u64 end_of_file; if (!len) return 0; if (gfs2_is_stuffed(ip)) { if (offset + len > gfs2_max_stuffed_size(ip)) return 1; return 0; } shift = sdp->sd_sb.sb_bsize_shift; BUG_ON(gfs2_is_dir(ip)); end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex)) return 1; size = (lblock_stop - lblock) << shift; do { bh.b_state = 0; bh.b_size = size; gfs2_block_map(&ip->i_inode, lblock, &bh, 0); if (!buffer_mapped(&bh)) return 1; size -= bh.b_size; lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); return 0; } static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; if (offset >= inode->i_size) return 0; if (offset + length > inode->i_size) length = inode->i_size - offset; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) return error; gfs2_trans_add_meta(ip->i_gl, dibh); memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0, length); brelse(dibh); return 0; } static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset, loff_t length) { struct gfs2_sbd *sdp = GFS2_SB(inode); loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; int error; while (length) { struct gfs2_trans *tr; loff_t chunk; unsigned int offs; chunk = length; if (chunk > max_chunk) chunk = max_chunk; 
offs = offset & ~PAGE_MASK; if (offs && chunk > PAGE_SIZE) chunk = offs + ((chunk - offs) & PAGE_MASK); truncate_pagecache_range(inode, offset, chunk); offset += chunk; length -= chunk; tr = current->journal_info; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) continue; gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); if (error) return error; } return 0; } int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int blocksize = i_blocksize(inode); loff_t start, end; int error; if (!gfs2_is_stuffed(ip)) { unsigned int start_off, end_len; start_off = offset & (blocksize - 1); end_len = (offset + length) & (blocksize - 1); if (start_off) { unsigned int len = length; if (length > blocksize - start_off) len = blocksize - start_off; error = gfs2_block_zero_range(inode, offset, len); if (error) goto out; if (start_off + length < blocksize) end_len = 0; } if (end_len) { error = gfs2_block_zero_range(inode, offset + length - end_len, end_len); if (error) goto out; } } start = round_down(offset, blocksize); end = round_up(offset + length, blocksize) - 1; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; if (gfs2_is_jdata(ip)) error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, GFS2_JTRUNC_REVOKES); else error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) return error; if (gfs2_is_stuffed(ip)) { error = stuffed_zero_range(inode, offset, length); if (error) goto out; } if (gfs2_is_jdata(ip)) { BUG_ON(!current->journal_info); gfs2_journaled_truncate_range(inode, offset, length); } else truncate_pagecache_range(inode, offset, offset + length - 1); file_update_time(file); mark_inode_dirty(inode); if (current->journal_info) gfs2_trans_end(sdp); if (!gfs2_is_stuffed(ip)) error = punch_hole(ip, offset, length); out: if (current->journal_info) 
gfs2_trans_end(sdp); return error; } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned int len) { int ret; if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) return -EIO; if (offset >= wpc->iomap.offset && offset < wpc->iomap.offset + wpc->iomap.length) return 0; memset(&wpc->iomap, 0, sizeof(wpc->iomap)); ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap); return ret; } const struct iomap_writeback_ops gfs2_writeback_ops = { .map_blocks = gfs2_map_blocks, };
4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 
1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 // SPDX-License-Identifier: GPL-2.0-or-later /* * SPCA508 chip based cameras subdriver * * Copyright (C) 2009 Jean-Francois Moine <http://moinejf.free.fr> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "spca508" #include "gspca.h" MODULE_AUTHOR("Michel Xhaard <mxhaard@users.sourceforge.net>"); MODULE_DESCRIPTION("GSPCA/SPCA508 USB Camera Driver"); MODULE_LICENSE("GPL"); /* specific webcam descriptor */ struct sd { struct gspca_dev gspca_dev; /* !! 
must be the first item */ u8 subtype; #define CreativeVista 0 #define HamaUSBSightcam 1 #define HamaUSBSightcam2 2 #define IntelEasyPCCamera 3 #define MicroInnovationIC200 4 #define ViewQuestVQ110 5 }; static const struct v4l2_pix_format sif_mode[] = { {160, 120, V4L2_PIX_FMT_SPCA508, V4L2_FIELD_NONE, .bytesperline = 160, .sizeimage = 160 * 120 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 3}, {176, 144, V4L2_PIX_FMT_SPCA508, V4L2_FIELD_NONE, .bytesperline = 176, .sizeimage = 176 * 144 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 2}, {320, 240, V4L2_PIX_FMT_SPCA508, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 1}, {352, 288, V4L2_PIX_FMT_SPCA508, V4L2_FIELD_NONE, .bytesperline = 352, .sizeimage = 352 * 288 * 3 / 2, .colorspace = V4L2_COLORSPACE_SRGB, .priv = 0}, }; /* Frame packet header offsets for the spca508 */ #define SPCA508_OFFSET_DATA 37 /* * Initialization data: this is the first set-up data written to the * device (before the open data). */ static const u16 spca508_init_data[][2] = { {0x0000, 0x870b}, {0x0020, 0x8112}, /* Video drop enable, ISO streaming disable */ {0x0003, 0x8111}, /* Reset compression & memory */ {0x0000, 0x8110}, /* Disable all outputs */ /* READ {0x0000, 0x8114} -> 0000: 00 */ {0x0000, 0x8114}, /* SW GPIO data */ {0x0008, 0x8110}, /* Enable charge pump output */ {0x0002, 0x8116}, /* 200 kHz pump clock */ /* UNKNOWN DIRECTION (URB_FUNCTION_SELECT_INTERFACE:) */ {0x0003, 0x8111}, /* Reset compression & memory */ {0x0000, 0x8111}, /* Normal mode (not reset) */ {0x0098, 0x8110}, /* Enable charge pump output, sync.serial,external 2x clock */ {0x000d, 0x8114}, /* SW GPIO data */ {0x0002, 0x8116}, /* 200 kHz pump clock */ {0x0020, 0x8112}, /* Video drop enable, ISO streaming disable */ /* --------------------------------------- */ {0x000f, 0x8402}, /* memory bank */ {0x0000, 0x8403}, /* ... 
address */ /* --------------------------------------- */ /* 0x88__ is Synchronous Serial Interface. */ /* TBD: This table could be expressed more compactly */ /* using spca508_write_i2c_vector(). */ /* TBD: Should see if the values in spca50x_i2c_data */ /* would work with the VQ110 instead of the values */ /* below. */ {0x00c0, 0x8804}, /* SSI slave addr */ {0x0008, 0x8802}, /* 375 Khz SSI clock */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, /* 375 Khz SSI clock */ {0x0012, 0x8801}, /* SSI reg addr */ {0x0080, 0x8800}, /* SSI data to write */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, /* 375 Khz SSI clock */ {0x0012, 0x8801}, /* SSI reg addr */ {0x0000, 0x8800}, /* SSI data to write */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, /* 375 Khz SSI clock */ {0x0011, 0x8801}, /* SSI reg addr */ {0x0040, 0x8800}, /* SSI data to write */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0013, 0x8801}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0014, 0x8801}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0015, 0x8801}, {0x0001, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0016, 0x8801}, {0x0003, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0017, 0x8801}, {0x0036, 
0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0018, 0x8801}, {0x00ec, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x001a, 0x8801}, {0x0094, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x001b, 0x8801}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0027, 0x8801}, {0x00a2, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0028, 0x8801}, {0x0040, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x002a, 0x8801}, {0x0084, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x002b, 0x8801}, {0x00a8, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x002c, 0x8801}, {0x00fe, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x002d, 0x8801}, {0x0003, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0038, 0x8801}, {0x0083, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0033, 0x8801}, {0x0081, 0x8800}, /* READ { 
0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0034, 0x8801}, {0x004a, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0039, 0x8801}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0010, 0x8801}, {0x00a8, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0006, 0x8801}, {0x0058, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0000, 0x8801}, {0x0004, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0040, 0x8801}, {0x0080, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0041, 0x8801}, {0x000c, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0042, 0x8801}, {0x000c, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0043, 0x8801}, {0x0028, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0044, 0x8801}, {0x0080, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0045, 0x8801}, {0x0020, 0x8800}, /* READ { 0x0001, 0x8803 } -> 
0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0046, 0x8801}, {0x0020, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0047, 0x8801}, {0x0080, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0048, 0x8801}, {0x004c, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x0049, 0x8801}, {0x0084, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x004a, 0x8801}, {0x0084, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x0008, 0x8802}, {0x004b, 0x8801}, {0x0084, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* --------------------------------------- */ {0x0012, 0x8700}, /* Clock speed 48Mhz/(2+2)/2= 6 Mhz */ {0x0000, 0x8701}, /* CKx1 clock delay adj */ {0x0000, 0x8701}, /* CKx1 clock delay adj */ {0x0001, 0x870c}, /* CKOx2 output */ /* --------------------------------------- */ {0x0080, 0x8600}, /* Line memory read counter (L) */ {0x0001, 0x8606}, /* reserved */ {0x0064, 0x8607}, /* Line memory read counter (H) 0x6480=25,728 */ {0x002a, 0x8601}, /* CDSP sharp interpolation mode, * line sel for color sep, edge enhance enab */ {0x0000, 0x8602}, /* optical black level for user settng = 0 */ {0x0080, 0x8600}, /* Line memory read counter (L) */ {0x000a, 0x8603}, /* optical black level calc mode: * auto; optical black offset = 10 */ {0x00df, 0x865b}, /* Horiz offset for valid pixels (L)=0xdf */ {0x0012, 0x865c}, /* Vert offset for valid lines (L)=0x12 */ /* The following two lines seem to be the "wrong" 
resolution. */ /* But perhaps these indicate the actual size of the sensor */ /* rather than the size of the current video mode. */ {0x0058, 0x865d}, /* Horiz valid pixels (*4) (L) = 352 */ {0x0048, 0x865e}, /* Vert valid lines (*4) (L) = 288 */ {0x0015, 0x8608}, /* A11 Coef ... */ {0x0030, 0x8609}, {0x00fb, 0x860a}, {0x003e, 0x860b}, {0x00ce, 0x860c}, {0x00f4, 0x860d}, {0x00eb, 0x860e}, {0x00dc, 0x860f}, {0x0039, 0x8610}, {0x0001, 0x8611}, /* R offset for white balance ... */ {0x0000, 0x8612}, {0x0001, 0x8613}, {0x0000, 0x8614}, {0x005b, 0x8651}, /* R gain for white balance ... */ {0x0040, 0x8652}, {0x0060, 0x8653}, {0x0040, 0x8654}, {0x0000, 0x8655}, {0x0001, 0x863f}, /* Fixed gamma correction enable, USB control, * lum filter disable, lum noise clip disable */ {0x00a1, 0x8656}, /* Window1 size 256x256, Windows2 size 64x64, * gamma look-up disable, * new edge enhancement enable */ {0x0018, 0x8657}, /* Edge gain high thresh */ {0x0020, 0x8658}, /* Edge gain low thresh */ {0x000a, 0x8659}, /* Edge bandwidth high threshold */ {0x0005, 0x865a}, /* Edge bandwidth low threshold */ /* -------------------------------- */ {0x0030, 0x8112}, /* Video drop enable, ISO streaming enable */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0xa908, 0x8802}, {0x0034, 0x8801}, /* SSI reg addr */ {0x00ca, 0x8800}, /* SSI data to write */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0x1f08, 0x8802}, {0x0006, 0x8801}, {0x0080, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* ----- Read back coefs we wrote earlier. 
*/ /* READ { 0x0000, 0x8608 } -> 0000: 15 */ /* READ { 0x0000, 0x8609 } -> 0000: 30 */ /* READ { 0x0000, 0x860a } -> 0000: fb */ /* READ { 0x0000, 0x860b } -> 0000: 3e */ /* READ { 0x0000, 0x860c } -> 0000: ce */ /* READ { 0x0000, 0x860d } -> 0000: f4 */ /* READ { 0x0000, 0x860e } -> 0000: eb */ /* READ { 0x0000, 0x860f } -> 0000: dc */ /* READ { 0x0000, 0x8610 } -> 0000: 39 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 08 */ {0xb008, 0x8802}, {0x0006, 0x8801}, {0x007d, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* This chunk is seemingly redundant with */ /* earlier commands (A11 Coef...), but if I disable it, */ /* the image appears too dark. Maybe there was some kind of */ /* reset since the earlier commands, so this is necessary again. */ {0x0015, 0x8608}, {0x0030, 0x8609}, {0xfffb, 0x860a}, {0x003e, 0x860b}, {0xffce, 0x860c}, {0xfff4, 0x860d}, {0xffeb, 0x860e}, {0xffdc, 0x860f}, {0x0039, 0x8610}, {0x0018, 0x8657}, {0x0000, 0x8508}, /* Disable compression. */ /* Previous line was: {0x0021, 0x8508}, * Enable compression. 
*/ {0x0032, 0x850b}, /* compression stuff */ {0x0003, 0x8509}, /* compression stuff */ {0x0011, 0x850a}, /* compression stuff */ {0x0021, 0x850d}, /* compression stuff */ {0x0010, 0x850c}, /* compression stuff */ {0x0003, 0x8500}, /* *** Video mode: 160x120 */ {0x0001, 0x8501}, /* Hardware-dominated snap control */ {0x0061, 0x8656}, /* Window1 size 128x128, Windows2 size 128x128, * gamma look-up disable, * new edge enhancement enable */ {0x0018, 0x8617}, /* Window1 start X (*2) */ {0x0008, 0x8618}, /* Window1 start Y (*2) */ {0x0061, 0x8656}, /* Window1 size 128x128, Windows2 size 128x128, * gamma look-up disable, * new edge enhancement enable */ {0x0058, 0x8619}, /* Window2 start X (*2) */ {0x0008, 0x861a}, /* Window2 start Y (*2) */ {0x00ff, 0x8615}, /* High lum thresh for white balance */ {0x0000, 0x8616}, /* Low lum thresh for white balance */ {0x0012, 0x8700}, /* Clock speed 48Mhz/(2+2)/2= 6 Mhz */ {0x0012, 0x8700}, /* Clock speed 48Mhz/(2+2)/2= 6 Mhz */ /* READ { 0x0000, 0x8656 } -> 0000: 61 */ {0x0028, 0x8802}, /* 375 Khz SSI clock, SSI r/w sync with VSYNC */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 28 */ {0x1f28, 0x8802}, /* 375 Khz SSI clock, SSI r/w sync with VSYNC */ {0x0010, 0x8801}, /* SSI reg addr */ {0x003e, 0x8800}, /* SSI data to write */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ {0x0028, 0x8802}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 28 */ {0x1f28, 0x8802}, {0x0000, 0x8801}, {0x001f, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ {0x0001, 0x8602}, /* optical black level for user settning = 1 */ /* Original: */ {0x0023, 0x8700}, /* Clock speed 48Mhz/(3+2)/4= 2.4 Mhz */ {0x000f, 0x8602}, /* optical black level for user settning = 15 */ {0x0028, 0x8802}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 28 */ {0x1f28, 0x8802}, {0x0010, 0x8801}, {0x007b, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ {0x002f, 0x8651}, /* R gain for white 
balance ... */ {0x0080, 0x8653}, /* READ { 0x0000, 0x8655 } -> 0000: 00 */ {0x0000, 0x8655}, {0x0030, 0x8112}, /* Video drop enable, ISO streaming enable */ {0x0020, 0x8112}, /* Video drop enable, ISO streaming disable */ /* UNKNOWN DIRECTION (URB_FUNCTION_SELECT_INTERFACE: (ALT=0) ) */ {} }; /* * Initialization data for Intel EasyPC Camera CS110 */ static const u16 spca508cs110_init_data[][2] = { {0x0000, 0x870b}, /* Reset CTL3 */ {0x0003, 0x8111}, /* Soft Reset compression, memory, TG & CDSP */ {0x0000, 0x8111}, /* Normal operation on reset */ {0x0090, 0x8110}, /* External Clock 2x & Synchronous Serial Interface Output */ {0x0020, 0x8112}, /* Video Drop packet enable */ {0x0000, 0x8114}, /* Software GPIO output data */ {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0003, 0x8114}, /* Initial sequence Synchronous Serial Interface */ {0x000f, 0x8402}, /* Memory bank Address */ {0x0000, 0x8403}, /* Memory bank Address */ {0x00ba, 0x8804}, /* SSI Slave address */ {0x0010, 0x8802}, /* 93.75kHz SSI Clock Two DataByte */ {0x0010, 0x8802}, /* 93.75kHz SSI Clock two DataByte */ {0x0001, 0x8801}, {0x000a, 0x8805}, /* a - NWG: Dunno what this is about */ {0x0000, 0x8800}, {0x0010, 0x8802}, {0x0002, 0x8801}, {0x0000, 0x8805}, {0x0000, 0x8800}, {0x0010, 0x8802}, {0x0003, 0x8801}, {0x0027, 0x8805}, {0x0001, 0x8800}, {0x0010, 0x8802}, {0x0004, 0x8801}, {0x0065, 0x8805}, {0x0001, 0x8800}, {0x0010, 0x8802}, {0x0005, 0x8801}, {0x0003, 0x8805}, {0x0000, 0x8800}, {0x0010, 0x8802}, {0x0006, 0x8801}, {0x001c, 0x8805}, {0x0000, 0x8800}, {0x0010, 0x8802}, {0x0007, 0x8801}, {0x002a, 0x8805}, {0x0000, 0x8800}, {0x0010, 0x8802}, {0x0002, 0x8704}, /* External input CKIx1 */ {0x0001, 0x8606}, /* 1 Line memory Read Counter (H) Result: (d)410 */ {0x009a, 0x8600}, /* Line memory Read Counter (L) */ {0x0001, 0x865b}, /* 1 Horizontal Offset for Valid Pixel(L) */ {0x0003, 0x865c}, /* 3 Vertical Offset for Valid Lines(L) */ {0x0058, 0x865d}, /* 58 Horizontal Valid Pixel Window(L) */ 
{0x0006, 0x8660}, /* Nibble data + input order */ {0x000a, 0x8602}, /* Optical black level set to 0x0a */ {0x0000, 0x8603}, /* Optical black level Offset */ /* {0x0000, 0x8611}, * 0 R Offset for white Balance */ /* {0x0000, 0x8612}, * 1 Gr Offset for white Balance */ /* {0x0000, 0x8613}, * 1f B Offset for white Balance */ /* {0x0000, 0x8614}, * f0 Gb Offset for white Balance */ {0x0040, 0x8651}, /* 2b BLUE gain for white balance good at all 60 */ {0x0030, 0x8652}, /* 41 Gr Gain for white Balance (L) */ {0x0035, 0x8653}, /* 26 RED gain for white balance */ {0x0035, 0x8654}, /* 40Gb Gain for white Balance (L) */ {0x0041, 0x863f}, /* Fixed Gamma correction enabled (makes colours look better) */ {0x0000, 0x8655}, /* High bits for white balance*****brightness control*** */ {} }; static const u16 spca508_sightcam_init_data[][2] = { /* This line seems to setup the frame/canvas */ {0x000f, 0x8402}, /* These 6 lines are needed to startup the webcam */ {0x0090, 0x8110}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0003, 0x8114}, {0x0080, 0x8804}, /* This part seems to make the pictures darker? (autobrightness?) */ {0x0001, 0x8801}, {0x0004, 0x8800}, {0x0003, 0x8801}, {0x00e0, 0x8800}, {0x0004, 0x8801}, {0x00b4, 0x8800}, {0x0005, 0x8801}, {0x0000, 0x8800}, {0x0006, 0x8801}, {0x00e0, 0x8800}, {0x0007, 0x8801}, {0x000c, 0x8800}, /* This section is just needed, it probably * does something like the previous section, * but the cam won't start if it's not included. 
*/ {0x0014, 0x8801}, {0x0008, 0x8800}, {0x0015, 0x8801}, {0x0067, 0x8800}, {0x0016, 0x8801}, {0x0000, 0x8800}, {0x0017, 0x8801}, {0x0020, 0x8800}, {0x0018, 0x8801}, {0x0044, 0x8800}, /* Makes the picture darker - and the * cam won't start if not included */ {0x001e, 0x8801}, {0x00ea, 0x8800}, {0x001f, 0x8801}, {0x0001, 0x8800}, {0x0003, 0x8801}, {0x00e0, 0x8800}, /* seems to place the colors ontop of each other #1 */ {0x0006, 0x8704}, {0x0001, 0x870c}, {0x0016, 0x8600}, {0x0002, 0x8606}, /* if not included the pictures becomes _very_ dark */ {0x0064, 0x8607}, {0x003a, 0x8601}, {0x0000, 0x8602}, /* seems to place the colors ontop of each other #2 */ {0x0016, 0x8600}, {0x0018, 0x8617}, {0x0008, 0x8618}, {0x00a1, 0x8656}, /* webcam won't start if not included */ {0x0007, 0x865b}, {0x0001, 0x865c}, {0x0058, 0x865d}, {0x0048, 0x865e}, /* adjusts the colors */ {0x0049, 0x8651}, {0x0040, 0x8652}, {0x004c, 0x8653}, {0x0040, 0x8654}, {} }; static const u16 spca508_sightcam2_init_data[][2] = { {0x0020, 0x8112}, {0x000f, 0x8402}, {0x0000, 0x8403}, {0x0008, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0009, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x000a, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x000b, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x000c, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x000d, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x000e, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0007, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x000f, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0018, 0x8660}, {0x0010, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0011, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0000, 0x86b0}, {0x0034, 0x86b1}, {0x0000, 0x86b2}, {0x0049, 0x86b3}, {0x0000, 0x86b4}, {0x0000, 0x86b4}, {0x0012, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0013, 0x8201}, {0x0008, 0x8200}, {0x0001, 0x8200}, {0x0001, 0x86b0}, {0x00aa, 0x86b1}, {0x0000, 0x86b2}, {0x00e4, 0x86b3}, {0x0000, 0x86b4}, {0x0000, 0x86b4}, {0x0018, 
0x8660}, {0x0090, 0x8110}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0003, 0x8114}, {0x0080, 0x8804}, {0x0003, 0x8801}, {0x0012, 0x8800}, {0x0004, 0x8801}, {0x0005, 0x8800}, {0x0005, 0x8801}, {0x0000, 0x8800}, {0x0006, 0x8801}, {0x0000, 0x8800}, {0x0007, 0x8801}, {0x0000, 0x8800}, {0x0008, 0x8801}, {0x0005, 0x8800}, {0x000a, 0x8700}, {0x000e, 0x8801}, {0x0004, 0x8800}, {0x0005, 0x8801}, {0x0047, 0x8800}, {0x0006, 0x8801}, {0x0000, 0x8800}, {0x0007, 0x8801}, {0x00c0, 0x8800}, {0x0008, 0x8801}, {0x0003, 0x8800}, {0x0013, 0x8801}, {0x0001, 0x8800}, {0x0009, 0x8801}, {0x0000, 0x8800}, {0x000a, 0x8801}, {0x0000, 0x8800}, {0x000b, 0x8801}, {0x0000, 0x8800}, {0x000c, 0x8801}, {0x0000, 0x8800}, {0x000e, 0x8801}, {0x0004, 0x8800}, {0x000f, 0x8801}, {0x0000, 0x8800}, {0x0010, 0x8801}, {0x0006, 0x8800}, {0x0011, 0x8801}, {0x0006, 0x8800}, {0x0012, 0x8801}, {0x0000, 0x8800}, {0x0013, 0x8801}, {0x0001, 0x8800}, {0x000a, 0x8700}, {0x0000, 0x8702}, {0x0000, 0x8703}, {0x00c2, 0x8704}, {0x0001, 0x870c}, {0x0044, 0x8600}, {0x0002, 0x8606}, {0x0064, 0x8607}, {0x003a, 0x8601}, {0x0008, 0x8602}, {0x0044, 0x8600}, {0x0018, 0x8617}, {0x0008, 0x8618}, {0x00a1, 0x8656}, {0x0004, 0x865b}, {0x0002, 0x865c}, {0x0058, 0x865d}, {0x0048, 0x865e}, {0x0012, 0x8608}, {0x002c, 0x8609}, {0x0002, 0x860a}, {0x002c, 0x860b}, {0x00db, 0x860c}, {0x00f9, 0x860d}, {0x00f1, 0x860e}, {0x00e3, 0x860f}, {0x002c, 0x8610}, {0x006c, 0x8651}, {0x0041, 0x8652}, {0x0059, 0x8653}, {0x0040, 0x8654}, {0x00fa, 0x8611}, {0x00ff, 0x8612}, {0x00f8, 0x8613}, {0x0000, 0x8614}, {0x0001, 0x863f}, {0x0000, 0x8640}, {0x0026, 0x8641}, {0x0045, 0x8642}, {0x0060, 0x8643}, {0x0075, 0x8644}, {0x0088, 0x8645}, {0x009b, 0x8646}, {0x00b0, 0x8647}, {0x00c5, 0x8648}, {0x00d2, 0x8649}, {0x00dc, 0x864a}, {0x00e5, 0x864b}, {0x00eb, 0x864c}, {0x00f0, 0x864d}, {0x00f6, 0x864e}, {0x00fa, 0x864f}, {0x00ff, 0x8650}, {0x0060, 0x8657}, {0x0010, 0x8658}, {0x0018, 0x8659}, {0x0005, 0x865a}, {0x0018, 0x8660}, {0x0003, 0x8509}, {0x0011, 
0x850a}, {0x0032, 0x850b}, {0x0010, 0x850c}, {0x0021, 0x850d}, {0x0001, 0x8500}, {0x0000, 0x8508}, {0x0012, 0x8608}, {0x002c, 0x8609}, {0x0002, 0x860a}, {0x0039, 0x860b}, {0x00d0, 0x860c}, {0x00f7, 0x860d}, {0x00ed, 0x860e}, {0x00db, 0x860f}, {0x0039, 0x8610}, {0x0012, 0x8657}, {0x000c, 0x8619}, {0x0004, 0x861a}, {0x00a1, 0x8656}, {0x00c8, 0x8615}, {0x0032, 0x8616}, {0x0030, 0x8112}, {0x0020, 0x8112}, {0x0020, 0x8112}, {0x000f, 0x8402}, {0x0000, 0x8403}, {0x0090, 0x8110}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0003, 0x8114}, {0x0080, 0x8804}, {0x0003, 0x8801}, {0x0012, 0x8800}, {0x0004, 0x8801}, {0x0005, 0x8800}, {0x0005, 0x8801}, {0x0047, 0x8800}, {0x0006, 0x8801}, {0x0000, 0x8800}, {0x0007, 0x8801}, {0x00c0, 0x8800}, {0x0008, 0x8801}, {0x0003, 0x8800}, {0x000a, 0x8700}, {0x000e, 0x8801}, {0x0004, 0x8800}, {0x0005, 0x8801}, {0x0047, 0x8800}, {0x0006, 0x8801}, {0x0000, 0x8800}, {0x0007, 0x8801}, {0x00c0, 0x8800}, {0x0008, 0x8801}, {0x0003, 0x8800}, {0x0013, 0x8801}, {0x0001, 0x8800}, {0x0009, 0x8801}, {0x0000, 0x8800}, {0x000a, 0x8801}, {0x0000, 0x8800}, {0x000b, 0x8801}, {0x0000, 0x8800}, {0x000c, 0x8801}, {0x0000, 0x8800}, {0x000e, 0x8801}, {0x0004, 0x8800}, {0x000f, 0x8801}, {0x0000, 0x8800}, {0x0010, 0x8801}, {0x0006, 0x8800}, {0x0011, 0x8801}, {0x0006, 0x8800}, {0x0012, 0x8801}, {0x0000, 0x8800}, {0x0013, 0x8801}, {0x0001, 0x8800}, {0x000a, 0x8700}, {0x0000, 0x8702}, {0x0000, 0x8703}, {0x00c2, 0x8704}, {0x0001, 0x870c}, {0x0044, 0x8600}, {0x0002, 0x8606}, {0x0064, 0x8607}, {0x003a, 0x8601}, {0x0008, 0x8602}, {0x0044, 0x8600}, {0x0018, 0x8617}, {0x0008, 0x8618}, {0x00a1, 0x8656}, {0x0004, 0x865b}, {0x0002, 0x865c}, {0x0058, 0x865d}, {0x0048, 0x865e}, {0x0012, 0x8608}, {0x002c, 0x8609}, {0x0002, 0x860a}, {0x002c, 0x860b}, {0x00db, 0x860c}, {0x00f9, 0x860d}, {0x00f1, 0x860e}, {0x00e3, 0x860f}, {0x002c, 0x8610}, {0x006c, 0x8651}, {0x0041, 0x8652}, {0x0059, 0x8653}, {0x0040, 0x8654}, {0x00fa, 0x8611}, {0x00ff, 0x8612}, {0x00f8, 0x8613}, {0x0000, 
0x8614}, {0x0001, 0x863f}, {0x0000, 0x8640}, {0x0026, 0x8641}, {0x0045, 0x8642}, {0x0060, 0x8643}, {0x0075, 0x8644}, {0x0088, 0x8645}, {0x009b, 0x8646}, {0x00b0, 0x8647}, {0x00c5, 0x8648}, {0x00d2, 0x8649}, {0x00dc, 0x864a}, {0x00e5, 0x864b}, {0x00eb, 0x864c}, {0x00f0, 0x864d}, {0x00f6, 0x864e}, {0x00fa, 0x864f}, {0x00ff, 0x8650}, {0x0060, 0x8657}, {0x0010, 0x8658}, {0x0018, 0x8659}, {0x0005, 0x865a}, {0x0018, 0x8660}, {0x0003, 0x8509}, {0x0011, 0x850a}, {0x0032, 0x850b}, {0x0010, 0x850c}, {0x0021, 0x850d}, {0x0001, 0x8500}, {0x0000, 0x8508}, {0x0012, 0x8608}, {0x002c, 0x8609}, {0x0002, 0x860a}, {0x0039, 0x860b}, {0x00d0, 0x860c}, {0x00f7, 0x860d}, {0x00ed, 0x860e}, {0x00db, 0x860f}, {0x0039, 0x8610}, {0x0012, 0x8657}, {0x0064, 0x8619}, /* This line starts it all, it is not needed here */ /* since it has been build into the driver */ /* jfm: don't start now */ /* {0x0030, 0x8112}, */ {} }; /* * Initialization data for Creative Webcam Vista */ static const u16 spca508_vista_init_data[][2] = { {0x0008, 0x8200}, /* Clear register */ {0x0000, 0x870b}, /* Reset CTL3 */ {0x0020, 0x8112}, /* Video Drop packet enable */ {0x0003, 0x8111}, /* Soft Reset compression, memory, TG & CDSP */ {0x0000, 0x8110}, /* Disable everything */ {0x0000, 0x8114}, /* Software GPIO output data */ {0x0000, 0x8114}, {0x0003, 0x8111}, {0x0000, 0x8111}, {0x0090, 0x8110}, /* Enable: SSI output, External 2X clock output */ {0x0020, 0x8112}, {0x0000, 0x8114}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0001, 0x8114}, {0x0003, 0x8114}, {0x000f, 0x8402}, /* Memory bank Address */ {0x0000, 0x8403}, /* Memory bank Address */ {0x00ba, 0x8804}, /* SSI Slave address */ {0x0010, 0x8802}, /* 93.75kHz SSI Clock Two DataByte */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, /* Will write 2 bytes (DATA1+DATA2) */ {0x0020, 0x8801}, /* Register address for SSI read/write */ {0x0044, 0x8805}, /* DATA2 */ {0x0004, 0x8800}, /* DATA1 -> write triggered */ /* READ { 
0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0009, 0x8801}, {0x0042, 0x8805}, {0x0001, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x003c, 0x8801}, {0x0001, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0001, 0x8801}, {0x000a, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0002, 0x8801}, {0x0000, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0003, 0x8801}, {0x0027, 0x8805}, {0x0001, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0004, 0x8801}, {0x0065, 0x8805}, {0x0001, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0005, 0x8801}, {0x0003, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0006, 0x8801}, {0x001c, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0007, 0x8801}, {0x002a, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x000e, 0x8801}, {0x0000, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 
0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0028, 0x8801}, {0x002e, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0039, 0x8801}, {0x0013, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x003b, 0x8801}, {0x000c, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0035, 0x8801}, {0x0028, 0x8805}, {0x0000, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8803 } -> 0000: 00 */ /* READ { 0x0001, 0x8802 } -> 0000: 10 */ {0x0010, 0x8802}, {0x0009, 0x8801}, {0x0042, 0x8805}, {0x0001, 0x8800}, /* READ { 0x0001, 0x8803 } -> 0000: 00 */ {0x0050, 0x8703}, {0x0002, 0x8704}, /* External input CKIx1 */ {0x0001, 0x870c}, /* Select CKOx2 output */ {0x009a, 0x8600}, /* Line memory Read Counter (L) */ {0x0001, 0x8606}, /* 1 Line memory Read Counter (H) Result: (d)410 */ {0x0023, 0x8601}, {0x0010, 0x8602}, {0x000a, 0x8603}, {0x009a, 0x8600}, {0x0001, 0x865b}, /* 1 Horizontal Offset for Valid Pixel(L) */ {0x0003, 0x865c}, /* Vertical offset for valid lines (L) */ {0x0058, 0x865d}, /* Horizontal valid pixels window (L) */ {0x0048, 0x865e}, /* Vertical valid lines window (L) */ {0x0000, 0x865f}, {0x0006, 0x8660}, /* Enable nibble data input, select nibble input order */ {0x0013, 0x8608}, /* A11 Coeficients for color correction */ {0x0028, 0x8609}, /* Note: these values are confirmed at the end of array */ {0x0005, 0x860a}, /* ... */ {0x0025, 0x860b}, {0x00e1, 0x860c}, {0x00fa, 0x860d}, {0x00f4, 0x860e}, {0x00e8, 0x860f}, {0x0025, 0x8610}, /* A33 Coef. 
*/ {0x00fc, 0x8611}, /* White balance offset: R */ {0x0001, 0x8612}, /* White balance offset: Gr */ {0x00fe, 0x8613}, /* White balance offset: B */ {0x0000, 0x8614}, /* White balance offset: Gb */ {0x0064, 0x8651}, /* R gain for white balance (L) */ {0x0040, 0x8652}, /* Gr gain for white balance (L) */ {0x0066, 0x8653}, /* B gain for white balance (L) */ {0x0040, 0x8654}, /* Gb gain for white balance (L) */ {0x0001, 0x863f}, /* Enable fixed gamma correction */ {0x00a1, 0x8656}, /* Size - Window1: 256x256, Window2: 128x128, * UV division: UV no change, * Enable New edge enhancement */ {0x0018, 0x8657}, /* Edge gain high threshold */ {0x0020, 0x8658}, /* Edge gain low threshold */ {0x000a, 0x8659}, /* Edge bandwidth high threshold */ {0x0005, 0x865a}, /* Edge bandwidth low threshold */ {0x0064, 0x8607}, /* UV filter enable */ {0x0016, 0x8660}, {0x0000, 0x86b0}, /* Bad pixels compensation address */ {0x00dc, 0x86b1}, /* X coord for bad pixels compensation (L) */ {0x0000, 0x86b2}, {0x0009, 0x86b3}, /* Y coord for bad pixels compensation (L) */ {0x0000, 0x86b4}, {0x0001, 0x86b0}, {0x00f5, 0x86b1}, {0x0000, 0x86b2}, {0x00c6, 0x86b3}, {0x0000, 0x86b4}, {0x0002, 0x86b0}, {0x001c, 0x86b1}, {0x0001, 0x86b2}, {0x00d7, 0x86b3}, {0x0000, 0x86b4}, {0x0003, 0x86b0}, {0x001c, 0x86b1}, {0x0001, 0x86b2}, {0x00d8, 0x86b3}, {0x0000, 0x86b4}, {0x0004, 0x86b0}, {0x001d, 0x86b1}, {0x0001, 0x86b2}, {0x00d8, 0x86b3}, {0x0000, 0x86b4}, {0x001e, 0x8660}, /* READ { 0x0000, 0x8608 } -> 0000: 13 */ /* READ { 0x0000, 0x8609 } -> 0000: 28 */ /* READ { 0x0000, 0x8610 } -> 0000: 05 */ /* READ { 0x0000, 0x8611 } -> 0000: 25 */ /* READ { 0x0000, 0x8612 } -> 0000: e1 */ /* READ { 0x0000, 0x8613 } -> 0000: fa */ /* READ { 0x0000, 0x8614 } -> 0000: f4 */ /* READ { 0x0000, 0x8615 } -> 0000: e8 */ /* READ { 0x0000, 0x8616 } -> 0000: 25 */ {} }; static int reg_write(struct gspca_dev *gspca_dev, u16 index, u16 value) { int ret; struct usb_device *dev = gspca_dev->dev; ret = usb_control_msg(dev, 
usb_sndctrlpipe(dev, 0), 0, /* request */ USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, NULL, 0, 500); gspca_dbg(gspca_dev, D_USBO, "reg write i:0x%04x = 0x%02x\n", index, value); if (ret < 0) pr_err("reg write: error %d\n", ret); return ret; } /* read 1 byte */ /* returns: negative is error, pos or zero is data */ static int reg_read(struct gspca_dev *gspca_dev, u16 index) /* wIndex */ { int ret; ret = usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), 0, /* register */ USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, /* value */ index, gspca_dev->usb_buf, 1, 500); /* timeout */ gspca_dbg(gspca_dev, D_USBI, "reg read i:%04x --> %02x\n", index, gspca_dev->usb_buf[0]); if (ret < 0) { pr_err("reg_read err %d\n", ret); return ret; } return gspca_dev->usb_buf[0]; } /* send 1 or 2 bytes to the sensor via the Synchronous Serial Interface */ static int ssi_w(struct gspca_dev *gspca_dev, u16 reg, u16 val) { int ret, retry; ret = reg_write(gspca_dev, 0x8802, reg >> 8); if (ret < 0) goto out; ret = reg_write(gspca_dev, 0x8801, reg & 0x00ff); if (ret < 0) goto out; if ((reg & 0xff00) == 0x1000) { /* if 2 bytes */ ret = reg_write(gspca_dev, 0x8805, val & 0x00ff); if (ret < 0) goto out; val >>= 8; } ret = reg_write(gspca_dev, 0x8800, val); if (ret < 0) goto out; /* poll until not busy */ retry = 10; for (;;) { ret = reg_read(gspca_dev, 0x8803); if (ret < 0) break; if (gspca_dev->usb_buf[0] == 0) break; if (--retry <= 0) { gspca_err(gspca_dev, "ssi_w busy %02x\n", gspca_dev->usb_buf[0]); ret = -1; break; } msleep(8); } out: return ret; } static int write_vector(struct gspca_dev *gspca_dev, const u16 (*data)[2]) { int ret = 0; while ((*data)[1] != 0) { if ((*data)[1] & 0x8000) { if ((*data)[1] == 0xdd00) /* delay */ msleep((*data)[0]); else ret = reg_write(gspca_dev, (*data)[1], (*data)[0]); } else { ret = ssi_w(gspca_dev, (*data)[1], (*data)[0]); } if (ret < 0) break; data++; } return ret; } /* this function is called at probe time */ static int 
sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; struct cam *cam; const u16 (*init_data)[2]; static const u16 (*(init_data_tb[]))[2] = { spca508_vista_init_data, /* CreativeVista 0 */ spca508_sightcam_init_data, /* HamaUSBSightcam 1 */ spca508_sightcam2_init_data, /* HamaUSBSightcam2 2 */ spca508cs110_init_data, /* IntelEasyPCCamera 3 */ spca508cs110_init_data, /* MicroInnovationIC200 4 */ spca508_init_data, /* ViewQuestVQ110 5 */ }; int data1, data2; /* Read from global register the USB product and vendor IDs, just to * prove that we can communicate with the device. This works, which * confirms at we are communicating properly and that the device * is a 508. */ data1 = reg_read(gspca_dev, 0x8104); data2 = reg_read(gspca_dev, 0x8105); gspca_dbg(gspca_dev, D_PROBE, "Webcam Vendor ID: 0x%02x%02x\n", data2, data1); data1 = reg_read(gspca_dev, 0x8106); data2 = reg_read(gspca_dev, 0x8107); gspca_dbg(gspca_dev, D_PROBE, "Webcam Product ID: 0x%02x%02x\n", data2, data1); data1 = reg_read(gspca_dev, 0x8621); gspca_dbg(gspca_dev, D_PROBE, "Window 1 average luminance: %d\n", data1); cam = &gspca_dev->cam; cam->cam_mode = sif_mode; cam->nmodes = ARRAY_SIZE(sif_mode); sd->subtype = id->driver_info; init_data = init_data_tb[sd->subtype]; return write_vector(gspca_dev, init_data); } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { return 0; } static int sd_start(struct gspca_dev *gspca_dev) { int mode; mode = gspca_dev->cam.cam_mode[gspca_dev->curr_mode].priv; reg_write(gspca_dev, 0x8500, mode); switch (mode) { case 0: case 1: reg_write(gspca_dev, 0x8700, 0x28); /* clock */ break; default: /* case 2: */ /* case 3: */ reg_write(gspca_dev, 0x8700, 0x23); /* clock */ break; } reg_write(gspca_dev, 0x8112, 0x10 | 0x20); return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { /* Video ISO disable, Video Drop Packet enable: */ reg_write(gspca_dev, 0x8112, 
0x20); } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { switch (data[0]) { case 0: /* start of frame */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); data += SPCA508_OFFSET_DATA; len -= SPCA508_OFFSET_DATA; gspca_frame_add(gspca_dev, FIRST_PACKET, data, len); break; case 0xff: /* drop */ break; default: data += 1; len -= 1; gspca_frame_add(gspca_dev, INTER_PACKET, data, len); break; } } static void setbrightness(struct gspca_dev *gspca_dev, s32 brightness) { /* MX seem contrast */ reg_write(gspca_dev, 0x8651, brightness); reg_write(gspca_dev, 0x8652, brightness); reg_write(gspca_dev, 0x8653, brightness); reg_write(gspca_dev, 0x8654, brightness); } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: setbrightness(gspca_dev, ctrl->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 5); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 255, 1, 128); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x0130, 0x0130), .driver_info = HamaUSBSightcam}, {USB_DEVICE(0x041e, 0x4018), .driver_info = CreativeVista}, {USB_DEVICE(0x0733, 0x0110), .driver_info = ViewQuestVQ110}, {USB_DEVICE(0x0af9, 
0x0010), .driver_info = HamaUSBSightcam}, {USB_DEVICE(0x0af9, 0x0011), .driver_info = HamaUSBSightcam2}, {USB_DEVICE(0x8086, 0x0110), .driver_info = IntelEasyPCCamera}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver);
2 2 15 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* Socket buffer accounting * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" #define select_skb_count(skb) (&rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. */ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_read(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } } /* * Note the addition of a ref on a socket buffer. */ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); skb_get(skb); } /* * Note the dropping of a ref on a socket buffer by the core. */ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(&rxrpc_n_rx_skbs); trace_rxrpc_skb(skb, 0, n, why); } /* * Note the destruction of a socket buffer. */ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); consume_skb(skb); } } /* * Clear a queue of socket buffers. 
*/ void rxrpc_purge_queue(struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); consume_skb(skb); } }
155 159 159 155 101 101 101 101 9 7 7 1 3 2 6 2 3 9 9 3 21 22 2 15 15 13 2 2 3 20 20 3 3 20 2 16 3 3 3 50 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/mount.c - kernfs mount implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> 
#include <linux/mount.h> #include <linux/init.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/seq_file.h> #include <linux/exportfs.h> #include <linux/uuid.h> #include <linux/statfs.h> #include "kernfs-internal.h" struct kmem_cache *kernfs_node_cache __ro_after_init; struct kmem_cache *kernfs_iattrs_cache __ro_after_init; struct kernfs_global_locks *kernfs_locks __ro_after_init; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) return scops->show_options(sf, root); return 0; } static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_path) return scops->show_path(sf, node, root); seq_dentry(sf, dentry, " \t\n\\"); return 0; } static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) { simple_statfs(dentry, buf); buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct kernfs_node *kn = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = kn->id; return FILEID_KERNFS; } static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_node *kn; struct inode *inode; u64 id; if (fh_len < 2) return NULL; 
switch (fh_type) { case FILEID_KERNFS: id = *(u64 *)fid; break; case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: /* * blk_log_action() exposes "LOW32,HIGH32" pair without * type and userland can call us with generic fid * constructed from them. Combine it back to ID. See * blk_log_action(). */ id = ((u64)fid->i32.gen << 32) | fid->i32.ino; break; default: return NULL; } kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); if (get_parent) { struct kernfs_node *parent; parent = kernfs_get_parent(kn); kernfs_put(kn); kn = parent; if (!kn) return ERR_PTR(-ESTALE); } inode = kernfs_get_inode(sb, kn); kernfs_put(kn); return d_obtain_alias(inode); } static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) { struct kernfs_node *kn = kernfs_dentry_node(child); return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); } static const struct export_operations kernfs_export_ops = { .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, }; /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question * * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one, * %NULL is returned. */ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { if (sb->s_op == &kernfs_sops) return kernfs_info(sb)->root; return NULL; } /* * find the next ancestor in the path down to @child, where @parent was the * ancestor whose descendant we want to find. * * Say the path is /a/b/c/d. @child is d, @parent is %NULL. 
We return the root * node. If @parent is b, then we return the node for c. * Passing in d as @parent is not ok. */ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, struct kernfs_node *parent) { if (child == parent) { pr_crit_once("BUG in find_next_ancestor: called with parent == child"); return NULL; } while (child->parent != parent) { if (!child->parent) return NULL; child = child->parent; } return child; } /** * kernfs_node_dentry - get a dentry for the given kernfs_node * @kn: kernfs_node for which a dentry is needed * @sb: the kernfs super_block * * Return: the dentry pointer */ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb) { struct dentry *dentry; struct kernfs_node *knparent; BUG_ON(sb->s_op != &kernfs_sops); dentry = dget(sb->s_root); /* Check if this is the root kernfs_node */ if (!kn->parent) return dentry; knparent = find_next_ancestor(kn, NULL); if (WARN_ON(!knparent)) { dput(dentry); return ERR_PTR(-EINVAL); } do { struct dentry *dtmp; struct kernfs_node *kntmp; if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); if (WARN_ON(!kntmp)) { dput(dentry); return ERR_PTR(-EINVAL); } dtmp = lookup_positive_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) return dtmp; knparent = kntmp; dentry = dtmp; } while (true); } static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root; info->sb = sb; /* Userspace would break if executables or devices appear on sysfs */ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* sysfs dentries and 
inodes don't require IO to create */ sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; } /* instantiate and link root dentry */ root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; } static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; kfc->ns_tag = NULL; return set_anon_super_fc(sb, fc); } /** * kernfs_super_ns - determine the namespace tag of a kernfs super_block * @sb: super_block of interest * * Return: the namespace tag associated with kernfs super_block @sb. */ const void *kernfs_super_ns(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); return info->ns; } /** * kernfs_get_tree - kernfs filesystem access/retrieval helper * @fc: The filesystem context. * * This is to be called from each kernfs user's fs_context->ops->get_tree() * implementation, which should set the specified ->@fs_type and ->@flags, and * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, * respectively. * * Return: %0 on success, -errno on failure. 
*/ int kernfs_get_tree(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; info->root = kfc->root; info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); fc->s_fs_info = info; sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = kfc->root; kfc->new_sb_created = true; error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); return error; } sb->s_flags |= SB_ACTIVE; uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_supers_rwsem); } fc->root = dget(sb->s_root); return 0; } void kernfs_free_fs_context(struct fs_context *fc) { /* Note that we don't deal with kfc->ns_tag here. */ kfree(fc->s_fs_info); fc->s_fs_info = NULL; } /** * kernfs_kill_sb - kill_sb for kernfs * @sb: super_block being killed * * This can be used directly for file_system_type->kill_sb(). If a kernfs * user needs extra cleanup, it can implement its own kill_sb() and call * this function at the end. */ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); struct kernfs_root *root = info->root; down_write(&root->kernfs_supers_rwsem); list_del(&info->node); up_write(&root->kernfs_supers_rwsem); /* * Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing kernfs_super_info. 
*/ kill_anon_super(sb); kfree(info); } static void __init kernfs_mutex_init(void) { int count; for (count = 0; count < NR_KERNFS_LOCKS; count++) mutex_init(&kernfs_locks->open_file_mutex[count]); } static void __init kernfs_lock_init(void) { kernfs_locks = kmalloc(sizeof(struct kernfs_global_locks), GFP_KERNEL); WARN_ON(!kernfs_locks); kernfs_mutex_init(); } void __init kernfs_init(void) { kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", sizeof(struct kernfs_iattrs), 0, SLAB_PANIC, NULL); kernfs_lock_init(); }
255 43 247 140 140 119 119 119 94 22 3 23 1 8 1 24 268 155 270 137 100 234 80 4 14 6 16 2 3 5 5 9 10 289 310 6 306 5 11 4 7 21 296 280 19 28 13 18 4 26 7 284 221 19 271 2 89 1 207 77 275 273 76 217 24 273 144 121 4 141 36 2 7 105 3 105 35 137 3 115 23 135 3 29 89 119 21 4 119 98 49 6 96 135 1 482 5 72 406 7 7 392 1 6 387 390 274 391 268 105 99 18 352 49 135 277 110 250 360 362 361 351 136 94 40 1 131 248 392 354 12 4 95 7 97 54 47 208 3 2 26 9 90 2 1 97 97 93 86 7 2 9 79 3 76 6 87 118 116 94 4 125 3 112 95 11 199 12 77 145 143 24 6 11 24 8 6 10 2 9 10 2 7 27 3 23 24 69 1 7 39 26 3 5 34 21 62 230 2 1 8 5 221 218 36 18 5 50 42 5 2 2 2 25 1 1 4 2 13 6 15 8 13 9 12 1 15 12 5 17 8 14 40 4 2 32 22 5 14 13 50 1 1 5 3 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 
347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 
847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 
1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 
1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 // SPDX-License-Identifier: GPL-2.0-only /* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. 
* * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sched/signal.h> #include "internal.h" /* * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to * indicate they support non-blocking reads or writes, we must clear it * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ static noinline void noinline pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); do { if (!(fmode & FMODE_NOWAIT)) break; } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT)); } /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. 
*/
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct folio *folio = page_folio(buf->page);
	struct address_space *mapping;

	folio_lock(folio);

	mapping = folio_mapping(folio);
	if (mapping) {
		WARN_ON(!folio_test_uptodate(folio));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this folio, since we'll remove it
		 * from the pagecache.  Otherwise truncate wont wait on the
		 * folio, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		folio_wait_writeback(folio);

		if (!filemap_release_folio(folio, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.  Note that this path returns without
		 * unlocking the folio.
		 */
		if (remove_mapping(mapping, folio)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return true;
		}
	}

	/*
	 * Raced with truncate or failed to remove folio from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	folio_unlock(folio);
	return false;
}

/*
 * ->release() for page cache backed (and user page backed) pipe buffers:
 * drop the reference on the buffer's page and clear PIPE_BUF_FLAG_LRU.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	put_page(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 *
 * Returns 0 if the folio is uptodate, -ENODATA if it lost its mapping
 * and -EIO if it is still not uptodate once the folio lock is held.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct folio *folio = page_folio(buf->page);
	int err;

	if (!folio_test_uptodate(folio)) {
		folio_lock(folio);

		/*
		 * Folio got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!folio->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!folio_test_uptodate(folio)) {
			err = -EIO;
			goto error;
		}

		/* Folio is ok after all, we are done */
		folio_unlock(folio);
	}

	return 0;
error:
	folio_unlock(folio);
	return err;
}

/* Pipe buffer operations for buffers that reference page cache pages. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.confirm	= page_cache_pipe_buf_confirm,
	.release	= page_cache_pipe_buf_release,
	.try_steal	= page_cache_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * ->try_steal() for user page buffers: refuse unless the buffer carries
 * PIPE_BUF_FLAG_GIFT; otherwise set PIPE_BUF_FLAG_LRU and defer to
 * generic_pipe_buf_try_steal().
 */
static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return false;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_try_steal(pipe, buf);
}

/* Pipe buffer operations for user pages; no ->confirm() is provided. */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.release	= page_cache_pipe_buf_release,
	.try_steal	= user_page_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Wake any task sleeping on the pipe's read wait queue and send SIGIO
 * (POLL_IN) to the pipe's asynchronous readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	/*
	 * Full barrier ahead of the lockless waitqueue_active() test.
	 * NOTE(review): presumably pairs with a barrier on the waiter
	 * side - confirm against the pipe wait helpers.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->rd_wait))
		wake_up_interruptible(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
* */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; if (!spd_pages) return 0; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } while (!pipe_full(head, tail, pipe->max_usage)) { struct pipe_buffer *buf = &pipe->bufs[head & mask]; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; buf->flags = 0; head++; pipe->head = head; page_nr++; ret += buf->len; if (!--spd->nr_pages) break; } if (!ret) ret = -EAGAIN; out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); return ret; } EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { pipe->bufs[head & mask] = *buf; pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); return ret; } EXPORT_SYMBOL(add_to_pipe); /* * Check if we need to grow the arrays holding pages and partial page * descriptions. 
*/ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int max_usage = READ_ONCE(pipe->max_usage); spd->nr_pages_max = max_usage; if (max_usage <= PIPE_DEF_BUFFERS) return 0; spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; } void splice_shrink_spd(struct splice_pipe_desc *spd) { if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); kfree(spd->partial); } /** * copy_splice_read - Copy data from a file and splice the copy into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function allocates a bunch of pages sufficient to hold the requested * amount of data (but limited by the remaining pipe capacity), passes it to * the file's ->read_iter() to read into and then splices the used pages into * the pipe. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. 
*/
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe,
			 size_t len, unsigned int flags)
{
	struct iov_iter to;
	struct bio_vec *bv;
	struct kiocb kiocb;
	struct page **pages;
	ssize_t ret;
	size_t used, npages, chunk, remain, keep = 0;
	int i;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * One allocation holds both arrays: npages bio_vecs followed by
	 * npages page pointers.
	 */
	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	pages = (struct page **)(bv + npages);
	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
	if (!npages) {
		kfree(bv);
		return -ENOMEM;
	}

	/* Clamp the request to the pages we actually got. */
	remain = len = min_t(size_t, len, npages * PAGE_SIZE);

	/* Describe each page as a bvec; only the last one may be partial. */
	for (i = 0; i < npages; i++) {
		chunk = min_t(size_t, PAGE_SIZE, remain);
		bv[i].bv_page = pages[i];
		bv[i].bv_offset = 0;
		bv[i].bv_len = chunk;
		remain -= chunk;
	}

	/* Do the I/O */
	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = in->f_op->read_iter(&kiocb, &to);

	if (ret > 0) {
		/* Number of pages that received at least one byte. */
		keep = DIV_ROUND_UP(ret, PAGE_SIZE);
		*ppos = kiocb.ki_pos;
	}

	/*
	 * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
	 * there", rather than -EFAULT.
	 */
	if (ret == -EFAULT)
		ret = -EAGAIN;

	/* Free any pages that didn't get touched at all. */
	if (keep < npages)
		release_pages(pages + keep, npages - keep);

	/*
	 * Push the remaining pages into the pipe.  When ret <= 0, keep is 0
	 * and the loop body never runs.
	 */
	remain = ret;
	for (i = 0; i < keep; i++) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);

		chunk = min_t(size_t, remain, PAGE_SIZE);
		*buf = (struct pipe_buffer) {
			.ops	= &default_pipe_buf_ops,
			.page	= bv[i].bv_page,
			.offset	= 0,
			.len	= chunk,
		};
		pipe->head++;
		remain -= chunk;
	}

	kfree(bv);
	return ret;
}
EXPORT_SYMBOL(copy_splice_read);

/* Default pipe buffer operations, built from the generic helpers. */
const struct pipe_buf_operations default_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.try_steal	= generic_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.get		= generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Wake any task sleeping on the pipe's write wait queue and send SIGIO
 * (POLL_OUT) to the pipe's asynchronous writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	/*
	 * Full barrier ahead of the lockless waitqueue_active() test.
	 * NOTE(review): presumably pairs with a barrier on the waiter
	 * side - confirm against the pipe wait helpers.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->wr_wait))
		wake_up_interruptible(&pipe->wr_wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there's no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
*/
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	int ret;

	while (!pipe_empty(head, tail)) {
		struct pipe_buffer *buf = &pipe->bufs[tail & mask];

		/* Offer the actor at most what is still requested. */
		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = pipe_buf_confirm(pipe, buf);
		if (unlikely(ret)) {
			/* -ENODATA is reported to the caller as 0. */
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		/* Account the bytes consumed by the actor. */
		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		/* Buffer fully drained: release it and advance the tail. */
		if (!buf->len) {
			pipe_buf_release(pipe, buf);
			tail++;
			pipe->tail = tail;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}

/*
 * We know we have a pipe buffer, but maybe it's empty?
 *
 * If the buffer at the tail has zero length, release it, advance the
 * tail past it and return true; otherwise return false.
 */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	struct pipe_buffer *buf = &pipe->bufs[tail & mask];

	if (unlikely(!buf->len)) {
		pipe_buf_release(pipe, buf);
		pipe->tail = tail+1;
		return true;
	}

	return false;
}

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced.
*/ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { /* * Check for signal early to make process killable when there are * always buffers available */ if (signal_pending(current)) return -ERESTARTSYS; repeat: while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; if (sd->need_wakeup) { wakeup_pipe_writers(pipe); sd->need_wakeup = false; } pipe_wait_readable(pipe); } if (eat_empty_buffer(pipe)) goto repeat; return 1; } /** * splice_from_pipe_begin - start splicing from pipe * @sd: information about the splice operation * * Description: * This function should be called before a loop containing * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } /** * splice_from_pipe_end - finish splicing from pipe * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wake up pipe writers if necessary. It should * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } /** * __splice_from_pipe - splice data from a pipe to given actor * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to * the desired destination. See pipe_to_file, pipe_to_sendmsg, or * pipe_to_user. 
* */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; splice_from_pipe_begin(sd); do { cond_resched(); ret = splice_from_pipe_next(pipe, sd); if (ret > 0) ret = splice_from_pipe_feed(pipe, sd, actor); } while (ret > 0); splice_from_pipe_end(pipe, sd); return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); /** * splice_from_pipe - splice data from a pipe to a file * @pipe: pipe to splice from * @out: file to splice to * @ppos: position in @out * @len: how many bytes to splice * @flags: splice modifier flags * @actor: handler that splices the data * * Description: * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); return ret; } /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. * This one is ->write_iter-based. 
* */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->max_usage; struct bio_vec *array; ssize_t ret; if (!out->f_op->write_iter) return -EINVAL; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; pipe_lock(pipe); splice_from_pipe_begin(&sd); while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; unsigned int head, tail, mask; size_t left; int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; if (unlikely(nbufs < pipe->max_usage)) { kfree(array); nbufs = pipe->max_usage; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { ret = -ENOMEM; break; } } head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ if (!this_len) continue; this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; goto done; } bvec_set_page(&array[n], buf->page, this_len, buf->offset); left -= this_len; n++; } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { buf->offset += ret; buf->len -= ret; ret 
= 0; } } } done: kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); #ifdef CONFIG_NET /** * splice_to_socket - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct socket *sock = sock_from_file(out); struct bio_vec bvec[16]; struct msghdr msg = {}; ssize_t ret = 0; size_t spliced = 0; bool need_wakeup = false; pipe_lock(pipe); while (len > 0) { unsigned int head, tail, mask, bc = 0; size_t remain = len; /* * Check for signal early to make process killable when there * are always buffers available */ ret = -ERESTARTSYS; if (signal_pending(current)) break; while (pipe_empty(pipe->head, pipe->tail)) { ret = 0; if (!pipe->writers) goto out; if (spliced) goto out; ret = -EAGAIN; if (flags & SPLICE_F_NONBLOCK) goto out; ret = -ERESTARTSYS; if (signal_pending(current)) goto out; if (need_wakeup) { wakeup_pipe_writers(pipe); need_wakeup = false; } pipe_wait_readable(pipe); } head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t seg; if (!buf->len) { tail++; continue; } seg = min_t(size_t, remain, buf->len); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; break; } bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); remain -= seg; if (remain == 0 || bc >= ARRAY_SIZE(bvec)) break; tail++; } if (!bc) break; msg.msg_flags = MSG_SPLICE_PAGES; if (flags & SPLICE_F_MORE) msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= 
MSG_MORE; if (out->f_flags & O_NONBLOCK) msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); ret = sock_sendmsg(sock, &msg); if (ret <= 0) break; spliced += ret; len -= ret; tail = pipe->tail; while (ret > 0) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; buf->len -= seg; ret -= seg; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; } } if (tail != pipe->tail) { pipe->tail = tail; if (pipe->files) need_wakeup = true; } } out: pipe_unlock(pipe); if (need_wakeup) wakeup_pipe_writers(pipe); return spliced ?: ret; } #endif static int warn_unsupported(struct file *file, const char *op) { pr_debug_ratelimited( "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } /* * Attempt to initiate a splice from pipe to file. */ static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Indicate to the caller that there was a premature EOF when reading from the * source and the caller didn't indicate they would be sending more data after * this. */ static void do_splice_eof(struct splice_desc *sd) { if (sd->splice_eof) sd->splice_eof(sd); } /* * Callers already called rw_verify_area() on the entire range. * No need to call it for sub ranges. */ static ssize_t do_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int p_space; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; if (!len) return 0; /* Don't try to read more the pipe has space for. 
*/ p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); /* * O_DIRECT and DAX don't deal with the pagecache, so we allocate a * buffer, copy into it and splice that into the pipe. */ if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** * vfs_splice_read - Read data from a file and splice it into a pipe * @in: File to splice from * @ppos: Input file offset * @pipe: Pipe to splice to * @len: Number of bytes to splice * @flags: Splice modifier flags (SPLICE_F_*) * * Splice the requested amount of data from the input file to the pipe. This * is synchronous as the caller must hold the pipe lock across the entire * operation. * * If successful, it returns the amount of data spliced, 0 if it hit the EOF or * a hole and a negative error code otherwise. */ ssize_t vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t ret; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; return do_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. Internally an allocated * pipe is cached in the process, and reused during the lifetime of * that process. 
*
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	ssize_t ret, bytes;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input to be seekable, as we don't want to randomly
	 * drop data for eg socket -> socket splicing. Use the piped splicing
	 * for that!
	 */
	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	bytes = 0;
	len = sd->total_len;

	/*
	 * Don't block on output, we have to drain the direct pipe.  The
	 * caller's original flags are kept in 'flags' for the read side.
	 */
	flags = sd->flags;
	sd->flags &= ~SPLICE_F_NONBLOCK;

	/*
	 * We signal MORE until we've read sufficient data to fulfill the
	 * request and we keep signalling it if the caller set it.
	 */
	more = sd->flags & SPLICE_F_MORE;
	sd->flags |= SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_read(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto read_failure;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If we now have sufficient data to fulfill the request then
		 * we clear SPLICE_F_MORE if it was not set initially.
		 */
		if (read_len >= len && !more)
			sd->flags &= ~SPLICE_F_MORE;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Nothing was written: rewind to before the read. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short write: only advance by what was actually written. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the internal pipe to empty before returning. */
	pipe->tail = pipe->head = 0;
	file_accessed(in);
	return bytes;

read_failure:
	/*
	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
	 * least 1 byte *then* we will also do the ->splice_eof() call.
	 */
	if (ret == 0 && !more && len > 0 && bytes)
		do_splice_eof(sd);
out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = &pipe->bufs[i];

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* With no bytes transferred, report the last status instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor used by do_splice_direct(): write the internal pipe's contents to
 * sd->u.file at sd->opos, bracketed by file_start_write()/file_end_write().
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	long ret;

	file_start_write(file);
	ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
	file_end_write(file);
	return ret;
}

/*
 * Actor used by splice_file_range(): same as direct_splice_actor() but
 * without the file_start_write()/file_end_write() bracket.
 */
static int splice_file_range_actor(struct pipe_inode_info *pipe,
					struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
}

/* ->splice_eof hook for direct splices: forward to the output file's op. */
static void direct_file_splice_eof(struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	if (file->f_op->splice_eof)
		file->f_op->splice_eof(file);
}

/*
 * Common body of do_splice_direct() and splice_file_range(): build the
 * splice descriptor and run splice_direct_to_actor() with @actor.
 *
 * Returns -EBADF if @out is not open for writing, -EINVAL if @out has
 * O_APPEND set, otherwise the result of splice_direct_to_actor().  *@ppos
 * is updated only when that result is positive.
 */
static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
				      struct file *out, loff_t *opos,
				      size_t len, unsigned int flags,
				      splice_direct_actor *actor)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.splice_eof	= direct_file_splice_eof,
		.opos		= opos,
	};
	ssize_t ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = splice_direct_to_actor(in, &sd, actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}
/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 * Callers already called rw_verify_area() on the entire range.
 */
ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
			 loff_t *opos, size_t len, unsigned int flags)
{
	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
				      direct_splice_actor);
}
EXPORT_SYMBOL(do_splice_direct);

/**
 * splice_file_range - splices data between two files for copy_file_range()
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 *
 * Description:
 *    For use by ->copy_file_range() methods.
 *    Like do_splice_direct(), but vfs_copy_file_range() already holds
 *    start_file_write() on @out file.
 *
 * Callers already called rw_verify_area() on the entire range.
*/
ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
			  loff_t *opos, size_t len)
{
	lockdep_assert(file_write_started(out));

	/* The length is capped at MAX_RW_COUNT and no splice flags are passed. */
	return do_splice_direct_actor(in, ppos, out, opos,
				      min_t(size_t, len, MAX_RW_COUNT),
				      0, splice_file_range_actor);
}
EXPORT_SYMBOL(splice_file_range);

/*
 * Wait until @pipe has room for at least one more buffer.
 *
 * Returns 0 once the pipe is not full, -EPIPE (after sending SIGPIPE to
 * the current task) if the pipe has no readers, -EAGAIN if the pipe is
 * full and SPLICE_F_NONBLOCK is set in @flags, or -ERESTARTSYS if a signal
 * is pending. Both callers in this file hold pipe_lock() around the call.
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
	for (;;) {
		if (unlikely(!pipe->readers)) {
			send_sig(SIGPIPE, current, 0);
			return -EPIPE;
		}
		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
			return 0;
		if (flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait_writable(pipe);
	}
}

static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);

/*
 * Splice up to @len bytes from @in at *@offset into @opipe.
 *
 * Takes the pipe lock, waits for room with wait_for_space() and then calls
 * do_splice_read(). Readers of @opipe are woken when data was added.
 * Returns the do_splice_read() result, or the wait_for_space() error.
 */
ssize_t splice_file_to_pipe(struct file *in,
			    struct pipe_inode_info *opipe,
			    loff_t *offset,
			    size_t len, unsigned int flags)
{
	ssize_t ret;

	pipe_lock(opipe);
	ret = wait_for_space(opipe, flags);
	if (!ret)
		ret = do_splice_read(in, offset, opipe, len, flags);
	pipe_unlock(opipe);
	if (ret > 0)
		wakeup_pipe_readers(opipe);
	return ret;
}

/*
 * Determine where to splice to/from.
 *
 * Dispatches on which of @in/@out is a pipe:
 *   - both pipes:   splice_pipe_to_pipe(); offsets are rejected (-ESPIPE)
 *                   and splicing a pipe to itself returns -EINVAL;
 *   - only @in:     do_splice_from() into @out at *@off_out or out->f_pos;
 *   - only @out:    splice_file_to_pipe() from @in at *@off_in or in->f_pos;
 *   - neither:      -EINVAL.
 *
 * @off_in/@off_out are kernel pointers here (the user copies are done by
 * __do_splice()). In the file<->pipe cases the position that was used is
 * written back (to *off_xxx or to f_pos) after the transfer, regardless of
 * the transfer's return value.
 *
 * Returns -EBADF if @in is not readable or @out is not writable.
 */
ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
		  loff_t *off_out, size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	ssize_t ret;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		/* O_NONBLOCK on either end makes the whole operation non-blocking. */
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
	} else if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			/* An explicit offset requires pwrite capability. */
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			offset = *off_out;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		if (!off_out)
			out->f_pos = offset;
		else
			*off_out = offset;
	} else if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			/* An explicit offset requires pread capability. */
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			offset = *off_in;
		} else {
			offset = in->f_pos;
		}

		ret = rw_verify_area(READ, in, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (out->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

		if (!off_in)
			in->f_pos = offset;
		else
			*off_in = offset;
	} else {
		ret = -EINVAL;
	}

	if (ret > 0) {
		/*
		 * Generate modify out before access in:
		 * do_splice_from() may've already sent modify out,
		 * and this ensures the events get merged.
		 */
		fsnotify_modify(out);
		fsnotify_access(in);
	}

	return ret;
}

/*
 * User-pointer front end for do_splice(): copy the optional offsets in
 * from user space, call do_splice(), and copy them back on success.
 *
 * An offset supplied for a pipe end is rejected with -ESPIPE; for pipe
 * ends pipe_clear_nowait() is called on the file.
 *
 * A single local 'offset' backs both __off_in and __off_out. Both can only
 * be set when neither file is a pipe (a pipe with an offset has already
 * returned -ESPIPE above), and in that case do_splice() takes its final
 * -EINVAL branch, so the shared storage is never used for two live offsets.
 */
static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
			   struct file *out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset, *__off_in = NULL, *__off_out = NULL;
	ssize_t ret;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		pipe_clear_nowait(in);
	}
	if (opipe) {
		if (off_out)
			return -ESPIPE;
		pipe_clear_nowait(out);
	}

	if (off_out) {
		if (copy_from_user(&offset, off_out, sizeof(loff_t)))
			return -EFAULT;
		__off_out = &offset;
	}
	if (off_in) {
		if (copy_from_user(&offset, off_in, sizeof(loff_t)))
			return -EFAULT;
		__off_in = &offset;
	}

	ret = do_splice(in, __off_in, out, __off_out, len, flags);
	if (ret < 0)
		return ret;

	/* Only a non-negative result publishes the updated offset to user space. */
	if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
		return -EFAULT;
	if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
		return -EFAULT;

	return ret;
}

/*
 * Add the pages behind the iterator @from to @pipe as pipe buffers using
 * user_page_pipe_buf_ops, with @flags as the buffer flags.
 *
 * Pages are fetched in batches of at most 16. Returns the number of bytes
 * added if any were added, otherwise the last return value seen (0 or a
 * negative error).
 */
static ssize_t iter_to_pipe(struct iov_iter *from,
			    struct pipe_inode_info *pipe,
			    unsigned int flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	ssize_t ret = 0;

	while (iov_iter_count(from)) {
		struct page *pages[16];
		ssize_t left;
		size_t start;
		int i, n;

		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
		if (left <= 0) {
			ret = left;
			break;
		}

		/* Number of pages spanned by 'left' bytes starting at offset 'start'. */
		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
		for (i = 0; i < n; i++) {
			int size = min_t(int, left, PAGE_SIZE - start);

			buf.page = pages[i];
			buf.offset = start;
			buf.len = size;
			ret = add_to_pipe(pipe, &buf);
			if (unlikely(ret < 0)) {
				/* Give back the bytes that did not make it into the pipe. */
				iov_iter_revert(from, left);
				// this one got dropped by add_to_pipe()
				while (++i < n)
					put_page(pages[i]);
				goto out;
			}
			total += ret;
			left -= size;
			/* Only the first page of a batch starts at a non-zero offset. */
			start = 0;
		}
	}
out:
	return total ? total : ret;
}

/*
 * __splice_from_pipe() actor used by vmsplice_to_user(): copy sd->len bytes
 * of @buf's page into the iterator in sd->u.data. Returns the number of
 * bytes copied, or -EFAULT if fewer than sd->len bytes were copied.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
	return n == sd->len ? n : -EFAULT;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 *
 * Returns -EBADF if @file is not a pipe, 0 if the iterator is empty,
 * otherwise the result of __splice_from_pipe().
 */
static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
				unsigned int flags)
{
	struct pipe_inode_info *pipe = get_pipe_info(file, true);
	struct splice_desc sd = {
		.total_len = iov_iter_count(iter),
		.flags = flags,
		.u.data = iter
	};
	ssize_t ret = 0;

	if (!pipe)
		return -EBADF;

	pipe_clear_nowait(file);

	if (sd.total_len) {
		pipe_lock(pipe);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
		pipe_unlock(pipe);
	}

	if (ret > 0)
		fsnotify_access(file);

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * SPLICE_F_GIFT in @flags is translated to PIPE_BUF_FLAG_GIFT on the
 * buffers. Returns -EBADF if @file is not a pipe, the wait_for_space()
 * error if no room could be obtained, otherwise the iter_to_pipe() result.
 */
static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
				unsigned int flags)
{
	struct pipe_inode_info *pipe;
	ssize_t ret = 0;
	unsigned buf_flag = 0;

	if (flags & SPLICE_F_GIFT)
		buf_flag = PIPE_BUF_FLAG_GIFT;

	pipe = get_pipe_info(file, true);
	if (!pipe)
		return -EBADF;

	pipe_clear_nowait(file);

	pipe_lock(pipe);
	ret = wait_for_space(pipe, flags);
	if (!ret)
		ret = iter_to_pipe(iter, pipe, buf_flag);
	pipe_unlock(pipe);
	if (ret > 0) {
		wakeup_pipe_readers(pipe);
		fsnotify_modify(file);
	}
	return ret;
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill them into
 * a pipe. The reverse isn't quite as easy, though.
There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restriction limitations on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t error;
	int type;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;
	/*
	 * The direction follows the file mode: a writable fd means user
	 * memory is the source (vmsplice_to_pipe()), a read-only fd means
	 * user memory is the destination (vmsplice_to_user()).
	 */
	if (fd_file(f)->f_mode & FMODE_WRITE)
		type = ITER_SOURCE;
	else if (fd_file(f)->f_mode & FMODE_READ)
		type = ITER_DEST;
	else
		return -EBADF;

	error = import_iovec(type, uiov, nr_segs,
			     ARRAY_SIZE(iovstack), &iov, &iter);
	if (error < 0)
		return error;

	if (!iov_iter_count(&iter))
		error = 0;
	else if (type == ITER_SOURCE)
		error = vmsplice_to_pipe(fd_file(f), &iter, flags);
	else
		error = vmsplice_to_user(fd_file(f), &iter, flags);

	kfree(iov);
	return error;
}

/*
 * splice(2) entry point: a zero @len returns 0 immediately, unknown flag
 * bits return -EINVAL, a bad descriptor returns -EBADF; everything else is
 * handled by __do_splice().
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	if (unlikely(!len))
		return 0;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	CLASS(fd, in)(fd_in);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fd_out);
	if (fd_empty(out))
		return -EBADF;

	return __do_splice(fd_file(in), off_in, fd_file(out), off_out,
					    len, flags);
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 *
 * Returns 0 when the pipe has data or has no writers left, -ERESTARTSYS if
 * a signal is pending, or -EAGAIN if the pipe is empty and
 * SPLICE_F_NONBLOCK is set in @flags.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check the pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_empty(pipe->head, pipe->tail))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_empty(pipe->head, pipe->tail)) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* No writers left: leave ret at 0 and let the caller proceed. */
		if (!pipe->writers)
			break;
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		pipe_wait_readable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 *
 * Returns 0 when the pipe is not full, -EPIPE (after sending SIGPIPE to the
 * current task) if the pipe has no readers, -EAGAIN if it is full and
 * SPLICE_F_NONBLOCK is set in @flags, or -ERESTARTSYS if a signal is
 * pending.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe_wait_writable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Splice contents of ipipe to opipe.
 *
 * Moves up to @len bytes of buffers from @ipipe to @opipe. Whole buffers
 * are moved; a final partial buffer is shared by taking an extra reference
 * and splitting its offset/length between the two pipes.
 *
 * Returns the number of bytes transferred, 0 when the input is empty and
 * has no writers, or a negative error if nothing was transferred.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;
	bool input_wakeup = false;


retry:
	/* Both preps run without holding both locks; state is rechecked below. */
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		size_t o_len;

		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			/* Keep a partial byte count if some data already moved. */
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/* Input drained and nobody can refill it: stop. */
		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage)) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers.  A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			/* The input slot no longer owns the buffer. */
			ibuf->ops = NULL;
			i_tail++;
			ipipe->tail = i_tail;
			input_wakeup = true;
			o_len = obuf->len;
			o_head++;
			opipe->head = o_head;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			if (!pipe_buf_get(ipipe, ibuf)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift and merge flags, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

			/* Output gets the first 'len' bytes, input keeps the rest. */
			obuf->len = len;
			ibuf->offset += len;
			ibuf->len -= len;
			o_len = len;
			o_head++;
			opipe->head = o_head;
		}
		ret += o_len;
		len -= o_len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	/* Whole buffers were removed from the input: wake its writers too. */
	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}

/*
 * Link contents of ipipe to opipe.
*/
static ssize_t link_pipe(struct pipe_inode_info *ipipe,
			 struct pipe_inode_info *opipe,
			 size_t len, unsigned int flags)
{
	unsigned int in_slot, in_mask;
	unsigned int out_slot, out_mask;
	ssize_t linked = 0;

	/*
	 * Both pipes are locked together. pipe_double_lock() orders the two
	 * locks by pipe info address, which avoids an ABBA deadlock between
	 * one task doing tee A -> B and another doing tee B -> A.
	 */
	pipe_double_lock(ipipe, opipe);

	in_slot = ipipe->tail;
	in_mask = ipipe->ring_size - 1;
	out_slot = opipe->head;
	out_mask = opipe->ring_size - 1;

	for (;;) {
		struct pipe_buffer *src, *dst;

		/* Nobody will ever read the output: signal and stop. */
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (linked == 0)
				linked = -EPIPE;
			break;
		}

		/* Stop once the input is exhausted or the output has no room. */
		if (pipe_empty(ipipe->head, in_slot))
			break;
		if (pipe_full(out_slot, opipe->tail, opipe->max_usage))
			break;

		src = &ipipe->bufs[in_slot & in_mask];
		dst = &opipe->bufs[out_slot & out_mask];

		/* Take a reference so the buffer can be shared by both pipes. */
		if (!pipe_buf_get(ipipe, src)) {
			if (!linked)
				linked = -EFAULT;
			break;
		}

		*dst = *src;

		/*
		 * The copy must carry neither the gift nor the merge flag,
		 * otherwise the page could be stolen more than once.
		 */
		dst->flags &= ~(PIPE_BUF_FLAG_GIFT | PIPE_BUF_FLAG_CAN_MERGE);

		/* Trim the last buffer to the remaining request length. */
		if (dst->len > len)
			dst->len = len;
		linked += dst->len;
		len -= dst->len;

		/* Publish the new output slot; the input tail is left untouched. */
		out_slot++;
		opipe->head = out_slot;
		in_slot++;

		if (!len)
			break;
	}

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/* Data was added to the output pipe: wake up any waiting readers. */
	if (linked > 0)
		wakeup_pipe_readers(opipe);

	return linked;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
*/
ssize_t do_tee(struct file *in, struct file *out, size_t len,
	       unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
	struct pipe_inode_info *opipe = get_pipe_info(out, true);
	ssize_t ret;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;
	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/* Both ends must be pipes, and they must be two different pipes. */
	if (!ipipe || !opipe || ipipe == opipe)
		return -EINVAL;

	/* O_NONBLOCK on either file makes the whole operation non-blocking. */
	if ((in->f_flags | out->f_flags) & O_NONBLOCK)
		flags |= SPLICE_F_NONBLOCK;

	/*
	 * Prepare both pipes, then duplicate the contents of ipipe into
	 * opipe without copying any data. Each step runs only if the
	 * previous one returned 0; the prep order does not really matter.
	 */
	ret = ipipe_prep(ipipe, flags);
	if (ret == 0)
		ret = opipe_prep(opipe, flags);
	if (ret == 0)
		ret = link_pipe(ipipe, opipe, len, flags);

	if (ret > 0) {
		fsnotify_access(in);
		fsnotify_modify(out);
	}

	return ret;
}

/*
 * tee(2) entry point: unknown flag bits return -EINVAL, a zero length
 * returns 0, a bad descriptor returns -EBADF; the work is done by do_tee().
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	if (unlikely((flags & ~SPLICE_F_ALL) != 0))
		return -EINVAL;

	if (unlikely(len == 0))
		return 0;

	CLASS(fd, in)(fdin);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fdout);
	if (fd_empty(out))
		return -EBADF;

	return do_tee(fd_file(in), fd_file(out), len, flags);
}
12 12 12 12 2 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Randomness driver for virtio
 *  Copyright (C) 2007, 2008 Rusty Russell IBM Corporation
 */

#include <asm/barrier.h>
#include <linux/err.h>
#include <linux/hw_random.h>
#include <linux/scatterlist.h>
#include <linux/spinlock.h>
#include <linux/virtio.h>
#include <linux/virtio_rng.h>
#include <linux/module.h>
#include <linux/slab.h>

/* Allocator for the per-device index used in the "virtio_rng.%d" name. */
static DEFINE_IDA(rng_index_ida);

/* Per-device driver state, stored in vdev->priv and hwrng.priv. */
struct virtrng_info {
	struct hwrng hwrng;
	struct virtqueue *vq;
	/* Holds "virtio_rng.<index>"; hwrng.name points at this buffer. */
	char name[25];
	/* Index obtained from rng_index_ida, released in remove_common(). */
	int index;
	/* Set once hwrng_register() succeeded, so remove knows to unregister. */
	bool hwrng_register_done;
	/* While true, virtio_read() returns -ENODEV without touching the queue. */
	bool hwrng_removed;
	/* data transfer */
	/* Completed by the virtqueue callback, by cleanup and by remove. */
	struct completion have_data;
	/* Bytes in data[] not yet handed out; published with store-release. */
	unsigned int data_avail;
	/* Offset in data[] of the next byte to hand out. */
	unsigned int data_idx;
	/* minimal size returned by rng_buffer_size() */
#if SMP_CACHE_BYTES < 32
	u8 data[32];
#else
	u8 data[SMP_CACHE_BYTES];
#endif
};

/*
 * Virtqueue callback: fetch the used buffer, publish the number of bytes
 * received in data_avail and complete have_data.
 */
static void random_recv_done(struct virtqueue *vq)
{
	struct virtrng_info *vi = vq->vdev->priv;
	unsigned int len;

	/* We can get spurious callbacks, e.g. shared IRQs + virtio_pci. */
	if (!virtqueue_get_buf(vi->vq, &len))
		return;

	/* Pairs with the smp_load_acquire() of data_avail in virtio_read(). */
	smp_store_release(&vi->data_avail, len);
	complete(&vi->have_data);
}

/*
 * Queue vi->data as a single input buffer and kick the device. The
 * completion is re-armed and data_idx reset before the buffer is queued.
 */
static void request_entropy(struct virtrng_info *vi)
{
	struct scatterlist sg;

	reinit_completion(&vi->have_data);
	vi->data_idx = 0;

	sg_init_one(&sg, vi->data, sizeof(vi->data));

	/* There should always be room for one buffer. */
	/* NOTE(review): the return value of virtqueue_add_inbuf() is not checked. */
	virtqueue_add_inbuf(vi->vq, &sg, 1, vi->data, GFP_KERNEL);

	virtqueue_kick(vi->vq);
}

/*
 * Copy up to @size bytes of buffered data into @buf, advance the read
 * index, and request a refill once the buffer has been fully consumed.
 * Returns the number of bytes copied.
 */
static unsigned int copy_data(struct virtrng_info *vi, void *buf,
			      unsigned int size)
{
	size = min_t(unsigned int, size, vi->data_avail);
	memcpy(buf, vi->data + vi->data_idx, size);
	vi->data_idx += size;
	vi->data_avail -= size;
	if (vi->data_avail == 0)
		request_entropy(vi);
	return size;
}

/*
 * hwrng ->read callback.
 *
 * Copies whatever is already buffered; when @wait is false it returns
 * right after that. Otherwise it keeps waiting for completions and copying
 * until @size bytes were delivered.
 *
 * Returns the number of bytes copied, -ENODEV if the device is marked
 * removed, or the negative result of wait_for_completion_killable().
 */
static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
{
	int ret;
	struct virtrng_info *vi = (struct virtrng_info *)rng->priv;
	unsigned int chunk;
	size_t read;

	if (vi->hwrng_removed)
		return -ENODEV;

	read = 0;

	/* copy available data */
	if (smp_load_acquire(&vi->data_avail)) {
		chunk = copy_data(vi, buf, size);
		size -= chunk;
		read += chunk;
	}

	if (!wait)
		return read;

	/* We have already copied available entropy,
	 * so either size is 0 or data_avail is 0
	 */
	while (size != 0) {
		/* data_avail is 0 but a request is pending */
		ret = wait_for_completion_killable(&vi->have_data);
		if (ret < 0)
			return ret;
		/* if vi->data_avail is 0, we have been interrupted
		 * by a cleanup, but buffer stays in the queue
		 */
		if (vi->data_avail == 0)
			return read;

		chunk = copy_data(vi, buf + read, size);
		size -= chunk;
		read += chunk;
	}

	return read;
}

/* hwrng ->cleanup callback: wake up a reader blocked in virtio_read(). */
static void virtio_cleanup(struct hwrng *rng)
{
	struct virtrng_info *vi = (struct virtrng_info *)rng->priv;

	complete(&vi->have_data);
}

/*
 * Allocate and initialise the per-device state, find the single "input"
 * virtqueue, mark the device ready and queue the first entropy request.
 * The hwrng is filled in but not registered here.
 *
 * Returns 0 on success or a negative errno; on failure the index and the
 * state allocated here are released.
 */
static int probe_common(struct virtio_device *vdev)
{
	int err, index;
	struct virtrng_info *vi = NULL;

	vi = kzalloc(sizeof(struct virtrng_info), GFP_KERNEL);
	if (!vi)
		return -ENOMEM;

	vi->index = index = ida_alloc(&rng_index_ida, GFP_KERNEL);
	if (index < 0) {
		err = index;
		goto err_ida;
	}
	sprintf(vi->name, "virtio_rng.%d", index);
	init_completion(&vi->have_data);

	vi->hwrng = (struct hwrng) {
		.read = virtio_read,
		.cleanup = virtio_cleanup,
		.priv = (unsigned long)vi,
		.name = vi->name,
	};
	vdev->priv = vi;

	/* We expect a single virtqueue. */
	vi->vq = virtio_find_single_vq(vdev, random_recv_done, "input");
	if (IS_ERR(vi->vq)) {
		/*
		 * NOTE(review): vdev->priv still points at vi, which is freed
		 * below; presumably harmless because the probe failed - confirm.
		 */
		err = PTR_ERR(vi->vq);
		goto err_find;
	}

	virtio_device_ready(vdev);

	/* we always have a pending entropy request */
	request_entropy(vi);

	return 0;

err_find:
	ida_free(&rng_index_ida, index);
err_ida:
	kfree(vi);
	return err;
}

/*
 * Tear down a device: mark it removed, drop buffered data, wake any waiting
 * reader, unregister the hwrng if it was registered, reset the device,
 * delete its virtqueues and free the index and state.
 */
static void remove_common(struct virtio_device *vdev)
{
	struct virtrng_info *vi = vdev->priv;

	vi->hwrng_removed = true;
	vi->data_avail = 0;
	vi->data_idx = 0;
	/* A woken reader sees data_avail == 0 and returns what it has. */
	complete(&vi->have_data);
	if (vi->hwrng_register_done)
		hwrng_unregister(&vi->hwrng);
	virtio_reset_device(vdev);
	vdev->config->del_vqs(vdev);
	ida_free(&rng_index_ida, vi->index);
	kfree(vi);
}

/* virtio ->probe callback: thin wrapper around probe_common(). */
static int virtrng_probe(struct virtio_device *vdev)
{
	return probe_common(vdev);
}

/* virtio ->remove callback: thin wrapper around remove_common(). */
static void virtrng_remove(struct virtio_device *vdev)
{
	remove_common(vdev);
}

/*
 * virtio ->scan callback: register the hwrng. A registration failure is
 * not reported; hwrng_register_done simply stays false.
 */
static void virtrng_scan(struct virtio_device *vdev)
{
	struct virtrng_info *vi = vdev->priv;
	int err;

	err = hwrng_register(&vi->hwrng);
	if (!err)
		vi->hwrng_register_done = true;
}

/* virtio ->freeze callback: full teardown via remove_common(). */
static int virtrng_freeze(struct virtio_device *vdev)
{
	remove_common(vdev);
	return 0;
}

/*
 * virtio ->restore callback: rebuild the device state with probe_common()
 * and register the hwrng again. Returns 0 or a negative errno.
 */
static int virtrng_restore(struct virtio_device *vdev)
{
	int err;

	err = probe_common(vdev);
	if (!err) {
		struct virtrng_info *vi = vdev->priv;

		/*
		 * Set hwrng_removed to ensure that virtio_read()
		 * does not block waiting for data before the
		 * registration is complete.
		 */
		vi->hwrng_removed = true;
		err = hwrng_register(&vi->hwrng);
		if (!err) {
			vi->hwrng_register_done = true;
			vi->hwrng_removed = false;
		}
	}

	return err;
}

/* Devices this driver binds to: any virtio RNG device. */
static const struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_RNG, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

/* Driver definition; freeze/restore go through pm_sleep_ptr(). */
static struct virtio_driver virtio_rng_driver = {
	.driver.name =	KBUILD_MODNAME,
	.id_table =	id_table,
	.probe =	virtrng_probe,
	.remove =	virtrng_remove,
	.scan =		virtrng_scan,
	.freeze =	pm_sleep_ptr(virtrng_freeze),
	.restore =	pm_sleep_ptr(virtrng_restore),
};

module_virtio_driver(virtio_rng_driver);
MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio random number driver");
MODULE_LICENSE("GPL");
1 1 1 1 1 1 1 1 1 1 1 6 1 5 4 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 
1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 
1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 // SPDX-License-Identifier: GPL-2.0-only /* * HID driver for Logitech receivers * * Copyright (c) 2011 Logitech */ #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/kfifo.h> #include <linux/delay.h> #include <linux/usb.h> /* For to_usb_interface for kvm extra intf check */ #include <linux/unaligned.h> #include "hid-ids.h" #define DJ_MAX_PAIRED_DEVICES 7 #define DJ_MAX_NUMBER_NOTIFS 8 #define DJ_RECEIVER_INDEX 0 #define DJ_DEVICE_INDEX_MIN 1 #define DJ_DEVICE_INDEX_MAX 7 #define DJREPORT_SHORT_LENGTH 15 #define DJREPORT_LONG_LENGTH 32 #define REPORT_ID_DJ_SHORT 0x20 #define REPORT_ID_DJ_LONG 0x21 #define REPORT_ID_HIDPP_SHORT 0x10 #define REPORT_ID_HIDPP_LONG 0x11 #define REPORT_ID_HIDPP_VERY_LONG 0x12 #define HIDPP_REPORT_SHORT_LENGTH 7 
#define HIDPP_REPORT_LONG_LENGTH		20

#define HIDPP_RECEIVER_INDEX			0xff

#define REPORT_TYPE_RFREPORT_FIRST		0x01
#define REPORT_TYPE_RFREPORT_LAST		0x1F

/*
 * DJ report types (values of dj_report.report_type). The *_PARAM_* constants
 * that follow each type are byte offsets into dj_report.report_params[].
 */

/* Command Switch to DJ mode */
#define REPORT_TYPE_CMD_SWITCH			0x80
#define CMD_SWITCH_PARAM_DEVBITFIELD		0x00
#define CMD_SWITCH_PARAM_TIMEOUT_SECONDS	0x01
#define TIMEOUT_NO_KEEPALIVE			0x00

/* Command to Get the list of Paired devices */
#define REPORT_TYPE_CMD_GET_PAIRED_DEVICES	0x81

/* Device Paired Notification */
#define REPORT_TYPE_NOTIF_DEVICE_PAIRED		0x41
#define SPFUNCTION_MORE_NOTIF_EXPECTED		0x01
#define SPFUNCTION_DEVICE_LIST_EMPTY		0x02
#define DEVICE_PAIRED_PARAM_SPFUNCTION		0x00
#define DEVICE_PAIRED_PARAM_EQUAD_ID_LSB	0x01
#define DEVICE_PAIRED_PARAM_EQUAD_ID_MSB	0x02
#define DEVICE_PAIRED_RF_REPORT_TYPE		0x03

/* Device Un-Paired Notification */
#define REPORT_TYPE_NOTIF_DEVICE_UNPAIRED	0x40

/* Connection Status Notification */
#define REPORT_TYPE_NOTIF_CONNECTION_STATUS	0x42
#define CONNECTION_STATUS_PARAM_STATUS		0x00
#define STATUS_LINKLOSS				0x01

/* Error Notification */
#define REPORT_TYPE_NOTIF_ERROR			0x7F
#define NOTIF_ERROR_PARAM_ETYPE			0x00
#define ETYPE_KEEPALIVE_TIMEOUT			0x01

/* supported DJ HID && RF report types */
#define REPORT_TYPE_KEYBOARD			0x01
#define REPORT_TYPE_MOUSE			0x02
#define REPORT_TYPE_CONSUMER_CONTROL		0x03
#define REPORT_TYPE_SYSTEM_CONTROL		0x04
#define REPORT_TYPE_MEDIA_CENTER		0x08
#define REPORT_TYPE_LEDS			0x0E

/*
 * RF Report types bitfield (stored in the reports_supported fields); the bit
 * number of each flag equals the matching REPORT_TYPE_* / HID report id.
 */
#define STD_KEYBOARD				BIT(1)
#define STD_MOUSE				BIT(2)
#define MULTIMEDIA				BIT(3)
#define POWER_KEYS				BIT(4)
#define KBD_MOUSE				BIT(5)
#define MEDIA_CENTER				BIT(8)
#define KBD_LEDS				BIT(14)
/* Fake (bitnr > NUMBER_OF_HID_REPORTS) bit to track HID++ capability */
#define HIDPP					BIT_ULL(63)

/*
 * HID++ Device Connected Notification. The HIDPP_PARAM_* constants are byte
 * offsets into hidpp_event.params[]; offset 0x03 is the eQUAD id MSB for
 * (e)QUAD devices and the device id for 27 MHz devices.
 */
#define REPORT_TYPE_NOTIF_DEVICE_CONNECTED	0x41
#define HIDPP_PARAM_PROTO_TYPE			0x00
#define HIDPP_PARAM_DEVICE_INFO			0x01
#define HIDPP_PARAM_EQUAD_LSB			0x02
#define HIDPP_PARAM_EQUAD_MSB			0x03
#define HIDPP_PARAM_27MHZ_DEVID			0x03
#define HIDPP_DEVICE_TYPE_MASK			GENMASK(3, 0)
#define HIDPP_LINK_STATUS_MASK			BIT(6)
#define HIDPP_MANUFACTURER_MASK			BIT(7)
#define HIDPP_27MHZ_SECURE_MASK			BIT(7)

#define HIDPP_DEVICE_TYPE_KEYBOARD		1
#define HIDPP_DEVICE_TYPE_MOUSE			2

#define HIDPP_SET_REGISTER			0x80
#define HIDPP_GET_LONG_REGISTER			0x83
#define HIDPP_REG_CONNECTION_STATE		0x02
#define HIDPP_REG_PAIRING_INFORMATION		0xB5
#define HIDPP_PAIRING_INFORMATION		0x20
#define HIDPP_FAKE_DEVICE_ARRIVAL		0x02

/* Kind of receiver a struct dj_receiver_dev represents (its ->type field) */
enum recvr_type {
	recvr_type_dj,
	recvr_type_hidpp,
	recvr_type_gaming_hidpp,
	recvr_type_mouse_only,
	recvr_type_27mhz,
	recvr_type_bluetooth,
	recvr_type_dinovo,
};

/* Layout of a short DJ report: DJREPORT_SHORT_LENGTH bytes in total */
struct dj_report {
	u8 report_id;
	u8 device_index;
	u8 report_type;		/* compared against the REPORT_TYPE_* values */
	u8 report_params[DJREPORT_SHORT_LENGTH - 3];
};

/* Layout of a HID++ report, sized for the long (HIDPP_REPORT_LONG_LENGTH) form */
struct hidpp_event {
	u8 report_id;
	u8 device_index;
	u8 sub_id;
	u8 params[HIDPP_REPORT_LONG_LENGTH - 3U];	/* indexed by HIDPP_PARAM_* */
} __packed;

/*
 * Per-receiver state, shared by all HID interfaces that belong to the same
 * physical receiver (see dj_get_receiver_dev()).
 */
struct dj_receiver_dev {
	/* Receiver interfaces; set on get, cleared on put of the interface */
	struct hid_device *mouse;
	struct hid_device *keyboard;
	struct hid_device *hidpp;
	/* Paired devices, indexed directly by their device_index */
	struct dj_device *paired_dj_devices[DJ_MAX_PAIRED_DEVICES +
					    DJ_DEVICE_INDEX_MIN];
	struct list_head list;		/* entry in dj_hdev_list */
	struct kref kref;		/* one reference per bound interface */
	struct work_struct work;	/* runs delayedwork_callback() */
	struct kfifo notif_fifo;	/* queue of struct dj_workitem */
	unsigned long last_query; /* in jiffies */
	bool ready;			/* work items are dropped until set */
	enum recvr_type type;
	/* NOTE(review): not used in the code visible here - confirm meaning */
	unsigned int unnumbered_application;
	spinlock_t lock;		/* taken around paired_dj_devices[] and notif_fifo */
};

/* One paired device, exposed as its own child hid_device */
struct dj_device {
	struct hid_device *hdev;
	struct dj_receiver_dev *dj_receiver_dev;
	u64 reports_supported;		/* STD_KEYBOARD, STD_MOUSE, ... HIDPP bits */
	u8 device_index;
};

#define WORKITEM_TYPE_EMPTY	0
#define WORKITEM_TYPE_PAIRED	1
#define WORKITEM_TYPE_UNPAIRED	2
#define WORKITEM_TYPE_UNKNOWN	255

/* Unit of deferred work passed through notif_fifo to delayedwork_callback() */
struct dj_workitem {
	u8 type; /* WORKITEM_TYPE_* */
	u8 device_index;
	u8 device_type;
	u8 quad_id_msb;
	u8 quad_id_lsb;
	u64 reports_supported;
};

/* Keyboard descriptor (1) */
static const char kbd_descriptor[] = {
	0x05, 0x01,		/* USAGE_PAGE (generic Desktop)     */
	0x09, 0x06,		/* USAGE (Keyboard)                 */
	0xA1, 0x01,		/* COLLECTION (Application)         */
	0x85, 0x01,		/* REPORT_ID (1)                    */
	0x95, 0x08,		/*   REPORT_COUNT (8)               */
	0x75, 0x01,		/*   REPORT_SIZE (1)                */
	0x15, 0x00,		/*   LOGICAL_MINIMUM (0)            */
	0x25, 0x01,		/*   LOGICAL_MAXIMUM (1)            */
	0x05, 0x07,		/*   USAGE_PAGE (Keyboard)          */
	0x19, 0xE0,		/*   USAGE_MINIMUM (Left Control)   */
	0x29, 0xE7,		/*   USAGE_MAXIMUM (Right GUI)      */
	0x81, 0x02,		/*   INPUT (Data,Var,Abs)           */
	0x95, 0x06,		/*   REPORT_COUNT (6)               */
	0x75, 0x08,		/*   REPORT_SIZE (8)                */
	0x15, 0x00,		/*   LOGICAL_MINIMUM (0)            */
	0x26, 0xFF, 0x00,	/*   LOGICAL_MAXIMUM (255)          */
	0x05, 0x07,		/*   USAGE_PAGE (Keyboard)          */
	0x19, 0x00,		/*   USAGE_MINIMUM (no event)       */
	0x2A, 0xFF, 0x00,	/*   USAGE_MAXIMUM (reserved)       */
	0x81, 0x00,		/*   INPUT (Data,Ary,Abs)           */
	0x85, 0x0e,		/* REPORT_ID (14)                   */
	0x05, 0x08,		/*   USAGE PAGE (LED page)          */
	0x95, 0x05,		/*   REPORT COUNT (5)               */
	0x75, 0x01,		/*   REPORT SIZE (1)                */
	0x15, 0x00,		/*   LOGICAL_MINIMUM (0)            */
	0x25, 0x01,		/*   LOGICAL_MAXIMUM (1)            */
	0x19, 0x01,		/*   USAGE MINIMUM (1)              */
	0x29, 0x05,		/*   USAGE MAXIMUM (5)              */
	0x91, 0x02,		/*   OUTPUT (Data, Variable, Absolute)  */
	0x95, 0x01,		/*   REPORT COUNT (1)               */
	0x75, 0x03,		/*   REPORT SIZE (3)                */
	0x91, 0x01,		/*   OUTPUT (Constant)              */
	0xC0
};

/* Mouse descriptor (2)     */
static const char mse_descriptor[] = {
	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
	0x09, 0x02,		/*  USAGE (Mouse)                       */
	0xA1, 0x01,		/*  COLLECTION (Application)            */
	0x85, 0x02,		/*    REPORT_ID = 2                     */
	0x09, 0x01,		/*    USAGE (pointer)                   */
	0xA1, 0x00,		/*    COLLECTION (physical)             */
	0x05, 0x09,		/*      USAGE_PAGE (buttons)            */
	0x19, 0x01,		/*      USAGE_MIN (1)                   */
	0x29, 0x10,		/*      USAGE_MAX (16)                  */
	0x15, 0x00,		/*      LOGICAL_MIN (0)                 */
	0x25, 0x01,		/*      LOGICAL_MAX (1)                 */
	0x95, 0x10,		/*      REPORT_COUNT (16)               */
	0x75, 0x01,		/*      REPORT_SIZE (1)                 */
	0x81, 0x02,		/*      INPUT (data var abs)            */
	0x05, 0x01,		/*      USAGE_PAGE (generic desktop)    */
	0x16, 0x01, 0xF8,	/*      LOGICAL_MIN (-2047)             */
	0x26, 0xFF, 0x07,	/*      LOGICAL_MAX (2047)              */
	0x75, 0x0C,		/*      REPORT_SIZE (12)                */
	0x95, 0x02,		/*      REPORT_COUNT (2)                */
	0x09, 0x30,		/*      USAGE (X)                       */
	0x09, 0x31,		/*      USAGE (Y)                       */
	0x81, 0x06,		/*      INPUT                           */
	0x15, 0x81,		/*      LOGICAL_MIN (-127)              */
	0x25, 0x7F,		/*      LOGICAL_MAX (127)               */
	0x75, 0x08,		/*      REPORT_SIZE (8)                 */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x09, 0x38,		/*      USAGE (wheel)                   */
	0x81, 0x06,		/*      INPUT                           */
	0x05, 0x0C,		/*      USAGE_PAGE(consumer)            */
	0x0A, 0x38, 0x02,	/*      USAGE(AC Pan)                   */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x81, 0x06,		/*      INPUT                           */
	0xC0,			/*    END_COLLECTION                    */
	0xC0,			/*  END_COLLECTION                      */
};

/* Mouse descriptor (2) for 27 MHz receiver, only 8 buttons */
static const char mse_27mhz_descriptor[] = {
	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
	0x09, 0x02,		/*  USAGE (Mouse)                       */
	0xA1, 0x01,		/*  COLLECTION (Application)            */
	0x85, 0x02,		/*    REPORT_ID = 2                     */
	0x09, 0x01,		/*    USAGE (pointer)                   */
	0xA1, 0x00,		/*    COLLECTION (physical)             */
	0x05, 0x09,		/*      USAGE_PAGE (buttons)            */
	0x19, 0x01,		/*      USAGE_MIN (1)                   */
	0x29, 0x08,		/*      USAGE_MAX (8)                   */
	0x15, 0x00,		/*      LOGICAL_MIN (0)                 */
	0x25, 0x01,		/*      LOGICAL_MAX (1)                 */
	0x95, 0x08,		/*      REPORT_COUNT (8)                */
	0x75, 0x01,		/*      REPORT_SIZE (1)                 */
	0x81, 0x02,		/*      INPUT (data var abs)            */
	0x05, 0x01,		/*      USAGE_PAGE (generic desktop)    */
	0x16, 0x01, 0xF8,	/*      LOGICAL_MIN (-2047)             */
	0x26, 0xFF, 0x07,	/*      LOGICAL_MAX (2047)              */
	0x75, 0x0C,		/*      REPORT_SIZE (12)                */
	0x95, 0x02,		/*      REPORT_COUNT (2)                */
	0x09, 0x30,		/*      USAGE (X)                       */
	0x09, 0x31,		/*      USAGE (Y)                       */
	0x81, 0x06,		/*      INPUT                           */
	0x15, 0x81,		/*      LOGICAL_MIN (-127)              */
	0x25, 0x7F,		/*      LOGICAL_MAX (127)               */
	0x75, 0x08,		/*      REPORT_SIZE (8)                 */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x09, 0x38,		/*      USAGE (wheel)                   */
	0x81, 0x06,		/*      INPUT                           */
	0x05, 0x0C,		/*      USAGE_PAGE(consumer)            */
	0x0A, 0x38, 0x02,	/*      USAGE(AC Pan)                   */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x81, 0x06,		/*      INPUT                           */
	0xC0,			/*    END_COLLECTION                    */
	0xC0,			/*  END_COLLECTION                      */
};

/* Mouse descriptor (2) for Bluetooth receiver, low-res hwheel, 12 buttons */
static const char mse_bluetooth_descriptor[] = {
	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
	0x09, 0x02,		/*  USAGE (Mouse)                       */
	0xA1, 0x01,		/*  COLLECTION (Application)            */
	0x85, 0x02,		/*    REPORT_ID = 2                     */
	0x09, 0x01,		/*    USAGE (pointer)                   */
	0xA1, 0x00,		/*    COLLECTION (physical)             */
	0x05, 0x09,		/*      USAGE_PAGE (buttons)            */
	0x19, 0x01,		/*      USAGE_MIN (1)                   */
	0x29, 0x08,		/*      USAGE_MAX (8)                   */
	0x15, 0x00,		/*      LOGICAL_MIN (0)                 */
	0x25, 0x01,		/*      LOGICAL_MAX (1)                 */
	0x95, 0x08,		/*      REPORT_COUNT (8)                */
	0x75, 0x01,		/*      REPORT_SIZE (1)                 */
	0x81, 0x02,		/*      INPUT (data var abs)            */
	0x05, 0x01,		/*      USAGE_PAGE (generic desktop)    */
	0x16, 0x01, 0xF8,	/*      LOGICAL_MIN (-2047)             */
	0x26, 0xFF, 0x07,	/*      LOGICAL_MAX (2047)              */
	0x75, 0x0C,		/*      REPORT_SIZE (12)                */
	0x95, 0x02,		/*      REPORT_COUNT (2)                */
	0x09, 0x30,		/*      USAGE (X)                       */
	0x09, 0x31,		/*      USAGE (Y)                       */
	0x81, 0x06,		/*      INPUT                           */
	0x15, 0x81,		/*      LOGICAL_MIN (-127)              */
	0x25, 0x7F,		/*      LOGICAL_MAX (127)               */
	0x75, 0x08,		/*      REPORT_SIZE (8)                 */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x09, 0x38,		/*      USAGE (wheel)                   */
	0x81, 0x06,		/*      INPUT                           */
	0x05, 0x0C,		/*      USAGE_PAGE(consumer)            */
	0x0A, 0x38, 0x02,	/*      USAGE(AC Pan)                   */
	0x15, 0xF9,		/*      LOGICAL_MIN (-7)                */
	0x25, 0x07,		/*      LOGICAL_MAX (7)                 */
	0x75, 0x04,		/*      REPORT_SIZE (4)                 */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x81, 0x06,		/*      INPUT                           */
	0x05, 0x09,		/*      USAGE_PAGE (buttons)            */
	0x19, 0x09,		/*      USAGE_MIN (9)                   */
	0x29, 0x0C,		/*      USAGE_MAX (12)                  */
	0x15, 0x00,		/*      LOGICAL_MIN (0)                 */
	0x25, 0x01,		/*      LOGICAL_MAX (1)                 */
	0x75, 0x01,		/*      REPORT_SIZE (1)                 */
	0x95, 0x04,		/*      REPORT_COUNT (4)                */
	0x81, 0x02,		/*      INPUT (Data,Var,Abs)            */
	0xC0,			/*    END_COLLECTION                    */
	0xC0,			/*  END_COLLECTION                      */
};

/* Mouse descriptor (5) for Bluetooth receiver, normal-res hwheel, 8 buttons */
static const char mse5_bluetooth_descriptor[] = {
	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
	0x09, 0x02,		/*  Usage (Mouse)                       */
	0xa1, 0x01,		/*  Collection (Application)            */
	0x85, 0x05,		/*   Report ID (5)                      */
	0x09, 0x01,		/*   Usage (Pointer)                    */
	0xa1, 0x00,		/*   Collection (Physical)              */
	0x05, 0x09,		/*    Usage Page (Button)               */
	0x19, 0x01,		/*    Usage Minimum (1)                 */
	0x29, 0x08,		/*    Usage Maximum (8)                 */
	0x15, 0x00,		/*    Logical Minimum (0)               */
	0x25, 0x01,		/*    Logical Maximum (1)               */
	0x95, 0x08,		/*    Report Count (8)                  */
	0x75, 0x01,		/*    Report Size (1)                   */
	0x81, 0x02,		/*    Input (Data,Var,Abs)              */
	0x05, 0x01,		/*    Usage Page (Generic Desktop)      */
	0x16, 0x01, 0xf8,	/*    Logical Minimum (-2047)           */
	0x26, 0xff, 0x07,	/*    Logical Maximum (2047)            */
	0x75, 0x0c,		/*    Report Size (12)                  */
	0x95, 0x02,		/*    Report Count (2)                  */
	0x09, 0x30,		/*    Usage (X)                         */
	0x09, 0x31,		/*    Usage (Y)                         */
	0x81, 0x06,		/*    Input (Data,Var,Rel)              */
	0x15, 0x81,		/*    Logical Minimum (-127)            */
	0x25, 0x7f,		/*    Logical Maximum (127)             */
	0x75, 0x08,		/*    Report Size (8)                   */
	0x95, 0x01,		/*    Report Count (1)                  */
	0x09, 0x38,		/*    Usage (Wheel)                     */
	0x81, 0x06,		/*    Input (Data,Var,Rel)              */
	0x05, 0x0c,		/*    Usage Page (Consumer Devices)     */
	0x0a, 0x38, 0x02,	/*    Usage (AC Pan)                    */
	0x15, 0x81,		/*    Logical Minimum (-127)            */
	0x25, 0x7f,		/*    Logical Maximum (127)             */
	0x75, 0x08,		/*    Report Size (8)                   */
	0x95, 0x01,		/*    Report Count (1)                  */
	0x81, 0x06,		/*    Input (Data,Var,Rel)              */
	0xc0,			/*   End Collection                     */
	0xc0,			/*  End Collection                      */
};

/* Gaming Mouse descriptor (2) */
static const char mse_high_res_descriptor[] = {
	0x05, 0x01,		/*  USAGE_PAGE (Generic Desktop)        */
	0x09, 0x02,		/*  USAGE (Mouse)                       */
	0xA1, 0x01,		/*  COLLECTION (Application)            */
	0x85, 0x02,		/*    REPORT_ID = 2                     */
	0x09, 0x01,		/*    USAGE (pointer)                   */
	0xA1, 0x00,		/*    COLLECTION (physical)             */
	0x05, 0x09,		/*      USAGE_PAGE (buttons)            */
	0x19, 0x01,		/*      USAGE_MIN (1)                   */
	0x29, 0x10,		/*      USAGE_MAX (16)                  */
	0x15, 0x00,		/*      LOGICAL_MIN (0)                 */
	0x25, 0x01,		/*      LOGICAL_MAX (1)                 */
	0x95, 0x10,		/*      REPORT_COUNT (16)               */
	0x75, 0x01,		/*      REPORT_SIZE (1)                 */
	0x81, 0x02,		/*      INPUT (data var abs)            */
	0x05, 0x01,		/*      USAGE_PAGE (generic desktop)    */
	0x16, 0x01, 0x80,	/*      LOGICAL_MIN (-32767)            */
	0x26, 0xFF, 0x7F,	/*      LOGICAL_MAX (32767)             */
	0x75, 0x10,		/*      REPORT_SIZE (16)                */
	0x95, 0x02,		/*      REPORT_COUNT (2)                */
	0x09, 0x30,		/*      USAGE (X)                       */
	0x09, 0x31,		/*      USAGE (Y)                       */
	0x81, 0x06,		/*      INPUT                           */
	0x15, 0x81,		/*      LOGICAL_MIN (-127)              */
	0x25, 0x7F,		/*      LOGICAL_MAX (127)               */
	0x75, 0x08,		/*      REPORT_SIZE (8)                 */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x09, 0x38,		/*      USAGE (wheel)                   */
	0x81, 0x06,		/*      INPUT                           */
	0x05, 0x0C,		/*      USAGE_PAGE(consumer)            */
	0x0A, 0x38, 0x02,	/*      USAGE(AC Pan)                   */
	0x95, 0x01,		/*      REPORT_COUNT (1)                */
	0x81, 0x06,		/*      INPUT                           */
	0xC0,			/*    END_COLLECTION                    */
	0xC0,			/*  END_COLLECTION                      */
};

/* Consumer Control descriptor (3) */
static const char consumer_descriptor[] = {
	0x05, 0x0C,		/* USAGE_PAGE (Consumer Devices)       */
	0x09, 0x01,		/* USAGE (Consumer Control)            */
	0xA1, 0x01,		/* COLLECTION (Application)            */
	0x85, 0x03,		/* REPORT_ID = 3                       */
	0x75, 0x10,		/* REPORT_SIZE (16)                    */
	0x95, 0x02,		/* REPORT_COUNT (2)                    */
	0x15, 0x01,		/* LOGICAL_MIN (1)                     */
	0x26, 0xFF, 0x02,	/* LOGICAL_MAX (767)                   */
	0x19, 0x01,		/* USAGE_MIN (1)                       */
	0x2A, 0xFF, 0x02,	/* USAGE_MAX (767)                     */
	0x81, 0x00,		/* INPUT (Data Ary Abs)                */
	0xC0,			/* END_COLLECTION                      */
};

/* System control descriptor (4) */
static const char syscontrol_descriptor[] = {
	0x05, 0x01,		/*   USAGE_PAGE (Generic Desktop)      */
	0x09, 0x80,		/*   USAGE (System Control)            */
	0xA1, 0x01,		/*   COLLECTION (Application)          */
	0x85, 0x04,		/*   REPORT_ID = 4                     */
	0x75, 0x02,		/*   REPORT_SIZE (2)                   */
	0x95, 0x01,		/*   REPORT_COUNT (1)                  */
	0x15, 0x01,		/*   LOGICAL_MIN (1)                   */
	0x25, 0x03,		/*   LOGICAL_MAX (3)                   */
	0x09, 0x82,		/*   USAGE (System Sleep)              */
	0x09, 0x81,		/*   USAGE (System Power Down)         */
	0x09, 0x83,		/*   USAGE (System Wake Up)            */
	0x81, 0x60,		/*   INPUT (Data Ary Abs NPrf Null)    */
	0x75, 0x06,		/*   REPORT_SIZE (6)                   */
	0x81, 0x03,		/*   INPUT (Cnst Var Abs)              */
	0xC0,			/*   END_COLLECTION                    */
};

/* Media descriptor (8) */
static const char media_descriptor[] = {
	0x06, 0xbc, 0xff,	/* Usage Page 0xffbc                   */
	0x09, 0x88,		/* Usage 0x0088                        */
	0xa1, 0x01,		/* BeginCollection                     */
	0x85, 0x08,		/*   Report ID 8                       */
	0x19, 0x01,		/*   Usage Min 0x0001                  */
	0x29, 0xff,		/*   Usage Max 0x00ff                  */
	0x15, 0x01,		/*   Logical Min 1                     */
	0x26, 0xff, 0x00,	/*   Logical Max 255                   */
	0x75, 0x08,		/*   Report Size 8                     */
	0x95, 0x01,		/*   Report Count 1                    */
	0x81, 0x00,		/*   Input                             */
	0xc0,			/* EndCollection                       */
};

/* HIDPP descriptor */
static const char hidpp_descriptor[] = {
	0x06, 0x00, 0xff,	/* Usage Page (Vendor Defined Page 1)  */
	0x09, 0x01,		/* Usage (Vendor Usage 1)              */
	0xa1, 0x01,		/* Collection (Application)            */
	0x85, 0x10,		/*   Report ID (16)                    */
	0x75, 0x08,		/*   Report Size (8)                   */
	0x95, 0x06,		/*   Report Count (6)                  */
	0x15, 0x00,		/*   Logical Minimum (0)               */
	0x26, 0xff, 0x00,	/*   Logical Maximum (255)             */
	0x09, 0x01,		/*   Usage (Vendor Usage 1)            */
	0x81, 0x00,		/*   Input (Data,Arr,Abs)              */
	0x09, 0x01,		/*   Usage (Vendor Usage 1)            */
	0x91, 0x00,		/*   Output (Data,Arr,Abs)             */
	0xc0,			/* End Collection                      */
	0x06, 0x00, 0xff,	/* Usage Page (Vendor Defined Page 1)  */
	0x09, 0x02,		/* Usage (Vendor Usage 2)              */
	0xa1, 0x01,		/* Collection (Application)            */
	0x85, 0x11,		/*   Report ID (17)                    */
	0x75, 0x08,		/*   Report Size (8)                   */
	0x95, 0x13,		/*   Report Count (19)                 */
	0x15, 0x00,		/*   Logical Minimum (0)               */
	0x26, 0xff, 0x00,	/*   Logical Maximum (255)             */
	0x09, 0x02,		/*   Usage (Vendor Usage 2)            */
	0x81, 0x00,		/*   Input (Data,Arr,Abs)              */
	0x09, 0x02,		/*   Usage (Vendor Usage 2)            */
	0x91, 0x00,		/*   Output (Data,Arr,Abs)             */
	0xc0,			/* End Collection                      */
	0x06, 0x00, 0xff,	/* Usage Page (Vendor Defined Page 1)  */
	0x09, 0x04,		/* Usage (Vendor Usage 0x04)           */
	0xa1, 0x01,		/* Collection (Application)            */
	0x85, 0x20,		/*   Report ID (32)                    */
	0x75, 0x08,		/*   Report Size (8)                   */
	0x95, 0x0e,		/*   Report Count (14)                 */
	0x15, 0x00,		/*   Logical Minimum (0)               */
	0x26, 0xff, 0x00,	/*   Logical Maximum (255)             */
	0x09, 0x41,		/*   Usage (Vendor Usage 0x41)         */
	0x81, 0x00,		/*   Input (Data,Arr,Abs)              */
	0x09, 0x41,		/*   Usage (Vendor Usage 0x41)         */
	0x91, 0x00,		/*   Output (Data,Arr,Abs)             */
	0x85, 0x21,		/*   Report ID (33)                    */
	0x95, 0x1f,		/*   Report Count (31)                 */
	0x15, 0x00,		/*   Logical Minimum (0)               */
	0x26, 0xff, 0x00,	/*   Logical Maximum (255)             */
	0x09, 0x42,		/*   Usage (Vendor Usage 0x42)         */
	0x81, 0x00,		/*   Input (Data,Arr,Abs)              */
	0x09, 0x42,		/*   Usage (Vendor Usage 0x42)         */
	0x91, 0x00,		/*   Output (Data,Arr,Abs)             */
	0xc0,			/* End Collection                      */
};

/* Maximum size of all defined hid reports in bytes (including report id) */
#define MAX_REPORT_SIZE 8

/* Make sure all descriptors are present here */
#define MAX_RDESC_SIZE				\
	(sizeof(kbd_descriptor) +		\
	 sizeof(mse_bluetooth_descriptor) +	\
	 sizeof(mse5_bluetooth_descriptor) +	\
	 sizeof(consumer_descriptor) +		\
	 sizeof(syscontrol_descriptor) +	\
	 sizeof(media_descriptor) +	\
sizeof(hidpp_descriptor)) /* Number of possible hid report types that can be created by this driver. * * Right now, RF report types have the same report types (or report id's) * than the hid report created from those RF reports. In the future * this doesnt have to be true. * * For instance, RF report type 0x01 which has a size of 8 bytes, corresponds * to hid report id 0x01, this is standard keyboard. Same thing applies to mice * reports and consumer control, etc. If a new RF report is created, it doesn't * has to have the same report id as its corresponding hid report, so an * translation may have to take place for future report types. */ #define NUMBER_OF_HID_REPORTS 32 static const u8 hid_reportid_size_map[NUMBER_OF_HID_REPORTS] = { [1] = 8, /* Standard keyboard */ [2] = 8, /* Standard mouse */ [3] = 5, /* Consumer control */ [4] = 2, /* System control */ [8] = 2, /* Media Center */ }; #define LOGITECH_DJ_INTERFACE_NUMBER 0x02 static const struct hid_ll_driver logi_dj_ll_driver; static int logi_dj_recv_query_paired_devices(struct dj_receiver_dev *djrcv_dev); static void delayedwork_callback(struct work_struct *work); static LIST_HEAD(dj_hdev_list); static DEFINE_MUTEX(dj_hdev_list_lock); static bool recvr_type_is_bluetooth(enum recvr_type type) { return type == recvr_type_bluetooth || type == recvr_type_dinovo; } /* * dj/HID++ receivers are really a single logical entity, but for BIOS/Windows * compatibility they have multiple USB interfaces. On HID++ receivers we need * to listen for input reports on both interfaces. The functions below are used * to create a single struct dj_receiver_dev for all interfaces belonging to * a single USB-device / receiver. */ static struct dj_receiver_dev *dj_find_receiver_dev(struct hid_device *hdev, enum recvr_type type) { struct dj_receiver_dev *djrcv_dev; char sep; /* * The bluetooth receiver contains a built-in hub and has separate * USB-devices for the keyboard and mouse interfaces. */ sep = recvr_type_is_bluetooth(type) ? 
'.' : '/'; /* Try to find an already-probed interface from the same device */ list_for_each_entry(djrcv_dev, &dj_hdev_list, list) { if (djrcv_dev->mouse && hid_compare_device_paths(hdev, djrcv_dev->mouse, sep)) { kref_get(&djrcv_dev->kref); return djrcv_dev; } if (djrcv_dev->keyboard && hid_compare_device_paths(hdev, djrcv_dev->keyboard, sep)) { kref_get(&djrcv_dev->kref); return djrcv_dev; } if (djrcv_dev->hidpp && hid_compare_device_paths(hdev, djrcv_dev->hidpp, sep)) { kref_get(&djrcv_dev->kref); return djrcv_dev; } } return NULL; } static void dj_release_receiver_dev(struct kref *kref) { struct dj_receiver_dev *djrcv_dev = container_of(kref, struct dj_receiver_dev, kref); list_del(&djrcv_dev->list); kfifo_free(&djrcv_dev->notif_fifo); kfree(djrcv_dev); } static void dj_put_receiver_dev(struct hid_device *hdev) { struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); mutex_lock(&dj_hdev_list_lock); if (djrcv_dev->mouse == hdev) djrcv_dev->mouse = NULL; if (djrcv_dev->keyboard == hdev) djrcv_dev->keyboard = NULL; if (djrcv_dev->hidpp == hdev) djrcv_dev->hidpp = NULL; kref_put(&djrcv_dev->kref, dj_release_receiver_dev); mutex_unlock(&dj_hdev_list_lock); } static struct dj_receiver_dev *dj_get_receiver_dev(struct hid_device *hdev, enum recvr_type type, unsigned int application, bool is_hidpp) { struct dj_receiver_dev *djrcv_dev; mutex_lock(&dj_hdev_list_lock); djrcv_dev = dj_find_receiver_dev(hdev, type); if (!djrcv_dev) { djrcv_dev = kzalloc(sizeof(*djrcv_dev), GFP_KERNEL); if (!djrcv_dev) goto out; INIT_WORK(&djrcv_dev->work, delayedwork_callback); spin_lock_init(&djrcv_dev->lock); if (kfifo_alloc(&djrcv_dev->notif_fifo, DJ_MAX_NUMBER_NOTIFS * sizeof(struct dj_workitem), GFP_KERNEL)) { kfree(djrcv_dev); djrcv_dev = NULL; goto out; } kref_init(&djrcv_dev->kref); list_add_tail(&djrcv_dev->list, &dj_hdev_list); djrcv_dev->last_query = jiffies; djrcv_dev->type = type; } if (application == HID_GD_KEYBOARD) djrcv_dev->keyboard = hdev; if (application == 
HID_GD_MOUSE) djrcv_dev->mouse = hdev; if (is_hidpp) djrcv_dev->hidpp = hdev; hid_set_drvdata(hdev, djrcv_dev); out: mutex_unlock(&dj_hdev_list_lock); return djrcv_dev; } static void logi_dj_recv_destroy_djhid_device(struct dj_receiver_dev *djrcv_dev, struct dj_workitem *workitem) { /* Called in delayed work context */ struct dj_device *dj_dev; unsigned long flags; spin_lock_irqsave(&djrcv_dev->lock, flags); dj_dev = djrcv_dev->paired_dj_devices[workitem->device_index]; djrcv_dev->paired_dj_devices[workitem->device_index] = NULL; spin_unlock_irqrestore(&djrcv_dev->lock, flags); if (dj_dev != NULL) { hid_destroy_device(dj_dev->hdev); kfree(dj_dev); } else { hid_err(djrcv_dev->hidpp, "%s: can't destroy a NULL device\n", __func__); } } static void logi_dj_recv_add_djhid_device(struct dj_receiver_dev *djrcv_dev, struct dj_workitem *workitem) { /* Called in delayed work context */ struct hid_device *djrcv_hdev = djrcv_dev->hidpp; struct hid_device *dj_hiddev; struct dj_device *dj_dev; u8 device_index = workitem->device_index; unsigned long flags; /* Device index goes from 1 to 6, we need 3 bytes to store the * semicolon, the index, and a null terminator */ unsigned char tmpstr[3]; /* We are the only one ever adding a device, no need to lock */ if (djrcv_dev->paired_dj_devices[device_index]) { /* The device is already known. No need to reallocate it. 
*/ dbg_hid("%s: device is already known\n", __func__); return; } dj_hiddev = hid_allocate_device(); if (IS_ERR(dj_hiddev)) { hid_err(djrcv_hdev, "%s: hid_allocate_dev failed\n", __func__); return; } dj_hiddev->ll_driver = &logi_dj_ll_driver; dj_hiddev->dev.parent = &djrcv_hdev->dev; dj_hiddev->bus = BUS_USB; dj_hiddev->vendor = djrcv_hdev->vendor; dj_hiddev->product = (workitem->quad_id_msb << 8) | workitem->quad_id_lsb; if (workitem->device_type) { const char *type_str = "Device"; switch (workitem->device_type) { case 0x01: type_str = "Keyboard"; break; case 0x02: type_str = "Mouse"; break; case 0x03: type_str = "Numpad"; break; case 0x04: type_str = "Presenter"; break; case 0x07: type_str = "Remote Control"; break; case 0x08: type_str = "Trackball"; break; case 0x09: type_str = "Touchpad"; break; } snprintf(dj_hiddev->name, sizeof(dj_hiddev->name), "Logitech Wireless %s PID:%04x", type_str, dj_hiddev->product); } else { snprintf(dj_hiddev->name, sizeof(dj_hiddev->name), "Logitech Wireless Device PID:%04x", dj_hiddev->product); } if (djrcv_dev->type == recvr_type_27mhz) dj_hiddev->group = HID_GROUP_LOGITECH_27MHZ_DEVICE; else dj_hiddev->group = HID_GROUP_LOGITECH_DJ_DEVICE; memcpy(dj_hiddev->phys, djrcv_hdev->phys, sizeof(djrcv_hdev->phys)); snprintf(tmpstr, sizeof(tmpstr), ":%d", device_index); strlcat(dj_hiddev->phys, tmpstr, sizeof(dj_hiddev->phys)); dj_dev = kzalloc(sizeof(struct dj_device), GFP_KERNEL); if (!dj_dev) { hid_err(djrcv_hdev, "%s: failed allocating dj_dev\n", __func__); goto dj_device_allocate_fail; } dj_dev->reports_supported = workitem->reports_supported; dj_dev->hdev = dj_hiddev; dj_dev->dj_receiver_dev = djrcv_dev; dj_dev->device_index = device_index; dj_hiddev->driver_data = dj_dev; spin_lock_irqsave(&djrcv_dev->lock, flags); djrcv_dev->paired_dj_devices[device_index] = dj_dev; spin_unlock_irqrestore(&djrcv_dev->lock, flags); if (hid_add_device(dj_hiddev)) { hid_err(djrcv_hdev, "%s: failed adding dj_device\n", __func__); goto 
hid_add_device_fail; } return; hid_add_device_fail: spin_lock_irqsave(&djrcv_dev->lock, flags); djrcv_dev->paired_dj_devices[device_index] = NULL; spin_unlock_irqrestore(&djrcv_dev->lock, flags); kfree(dj_dev); dj_device_allocate_fail: hid_destroy_device(dj_hiddev); } static void delayedwork_callback(struct work_struct *work) { struct dj_receiver_dev *djrcv_dev = container_of(work, struct dj_receiver_dev, work); struct dj_workitem workitem; unsigned long flags; int count; int retval; dbg_hid("%s\n", __func__); spin_lock_irqsave(&djrcv_dev->lock, flags); /* * Since we attach to multiple interfaces, we may get scheduled before * we are bound to the HID++ interface, catch this. */ if (!djrcv_dev->ready) { pr_warn("%s: delayedwork queued before hidpp interface was enumerated\n", __func__); spin_unlock_irqrestore(&djrcv_dev->lock, flags); return; } count = kfifo_out(&djrcv_dev->notif_fifo, &workitem, sizeof(workitem)); if (count != sizeof(workitem)) { spin_unlock_irqrestore(&djrcv_dev->lock, flags); return; } if (!kfifo_is_empty(&djrcv_dev->notif_fifo)) schedule_work(&djrcv_dev->work); spin_unlock_irqrestore(&djrcv_dev->lock, flags); switch (workitem.type) { case WORKITEM_TYPE_PAIRED: logi_dj_recv_add_djhid_device(djrcv_dev, &workitem); break; case WORKITEM_TYPE_UNPAIRED: logi_dj_recv_destroy_djhid_device(djrcv_dev, &workitem); break; case WORKITEM_TYPE_UNKNOWN: retval = logi_dj_recv_query_paired_devices(djrcv_dev); if (retval) { hid_err(djrcv_dev->hidpp, "%s: logi_dj_recv_query_paired_devices error: %d\n", __func__, retval); } break; case WORKITEM_TYPE_EMPTY: dbg_hid("%s: device list is empty\n", __func__); break; } } /* * Sometimes we receive reports for which we do not have a paired dj_device * associated with the device_index or report-type to forward the report to. * This means that the original "device paired" notification corresponding * to the dj_device never arrived to this driver. 
Possible reasons for this are: * 1) hid-core discards all packets coming from a device during probe(). * 2) if the receiver is plugged into a KVM switch then the pairing reports * are only forwarded to it if the focus is on this PC. * This function deals with this by re-asking the receiver for the list of * connected devices in the delayed work callback. * This function MUST be called with djrcv->lock held. */ static void logi_dj_recv_queue_unknown_work(struct dj_receiver_dev *djrcv_dev) { struct dj_workitem workitem = { .type = WORKITEM_TYPE_UNKNOWN }; /* Rate limit queries done because of unhandled reports to 2/sec */ if (time_before(jiffies, djrcv_dev->last_query + HZ / 2)) return; kfifo_in(&djrcv_dev->notif_fifo, &workitem, sizeof(workitem)); schedule_work(&djrcv_dev->work); } static void logi_dj_recv_queue_notification(struct dj_receiver_dev *djrcv_dev, struct dj_report *dj_report) { /* We are called from atomic context (tasklet && djrcv->lock held) */ struct dj_workitem workitem = { .device_index = dj_report->device_index, }; switch (dj_report->report_type) { case REPORT_TYPE_NOTIF_DEVICE_PAIRED: workitem.type = WORKITEM_TYPE_PAIRED; if (dj_report->report_params[DEVICE_PAIRED_PARAM_SPFUNCTION] & SPFUNCTION_DEVICE_LIST_EMPTY) { workitem.type = WORKITEM_TYPE_EMPTY; break; } fallthrough; case REPORT_TYPE_NOTIF_DEVICE_UNPAIRED: workitem.quad_id_msb = dj_report->report_params[DEVICE_PAIRED_PARAM_EQUAD_ID_MSB]; workitem.quad_id_lsb = dj_report->report_params[DEVICE_PAIRED_PARAM_EQUAD_ID_LSB]; workitem.reports_supported = get_unaligned_le32( dj_report->report_params + DEVICE_PAIRED_RF_REPORT_TYPE); workitem.reports_supported |= HIDPP; if (dj_report->report_type == REPORT_TYPE_NOTIF_DEVICE_UNPAIRED) workitem.type = WORKITEM_TYPE_UNPAIRED; break; default: logi_dj_recv_queue_unknown_work(djrcv_dev); return; } kfifo_in(&djrcv_dev->notif_fifo, &workitem, sizeof(workitem)); schedule_work(&djrcv_dev->work); } /* * Some quad/bluetooth keyboards have a builtin touchpad in 
this case we see * only 1 paired device with a device_type of REPORT_TYPE_KEYBOARD. For the * touchpad to work we must also forward mouse input reports to the dj_hiddev * created for the keyboard (instead of forwarding them to a second paired * device with a device_type of REPORT_TYPE_MOUSE as we normally would). * * On Dinovo receivers the keyboard's touchpad and an optional paired actual * mouse send separate input reports, INPUT(2) aka STD_MOUSE for the mouse * and INPUT(5) aka KBD_MOUSE for the keyboard's touchpad. * * On MX5x00 receivers (which can also be paired with a Dinovo keyboard) * INPUT(2) is used for both an optional paired actual mouse and for the * keyboard's touchpad. */ static const u16 kbd_builtin_touchpad_ids[] = { 0xb309, /* Dinovo Edge */ 0xb30c, /* Dinovo Mini */ }; static void logi_hidpp_dev_conn_notif_equad(struct hid_device *hdev, struct hidpp_event *hidpp_report, struct dj_workitem *workitem) { struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); int i, id; workitem->type = WORKITEM_TYPE_PAIRED; workitem->device_type = hidpp_report->params[HIDPP_PARAM_DEVICE_INFO] & HIDPP_DEVICE_TYPE_MASK; workitem->quad_id_msb = hidpp_report->params[HIDPP_PARAM_EQUAD_MSB]; workitem->quad_id_lsb = hidpp_report->params[HIDPP_PARAM_EQUAD_LSB]; switch (workitem->device_type) { case REPORT_TYPE_KEYBOARD: workitem->reports_supported |= STD_KEYBOARD | MULTIMEDIA | POWER_KEYS | MEDIA_CENTER | HIDPP; id = (workitem->quad_id_msb << 8) | workitem->quad_id_lsb; for (i = 0; i < ARRAY_SIZE(kbd_builtin_touchpad_ids); i++) { if (id == kbd_builtin_touchpad_ids[i]) { if (djrcv_dev->type == recvr_type_dinovo) workitem->reports_supported |= KBD_MOUSE; else workitem->reports_supported |= STD_MOUSE; break; } } break; case REPORT_TYPE_MOUSE: workitem->reports_supported |= STD_MOUSE | HIDPP | MULTIMEDIA; break; } } static void logi_hidpp_dev_conn_notif_27mhz(struct hid_device *hdev, struct hidpp_event *hidpp_report, struct dj_workitem *workitem) { workitem->type = 
WORKITEM_TYPE_PAIRED; workitem->quad_id_lsb = hidpp_report->params[HIDPP_PARAM_27MHZ_DEVID]; switch (hidpp_report->device_index) { case 1: /* Index 1 is always a mouse */ case 2: /* Index 2 is always a mouse */ workitem->device_type = HIDPP_DEVICE_TYPE_MOUSE; workitem->reports_supported |= STD_MOUSE | HIDPP; break; case 3: /* Index 3 is always the keyboard */ if (hidpp_report->params[HIDPP_PARAM_DEVICE_INFO] & HIDPP_27MHZ_SECURE_MASK) { hid_info(hdev, "Keyboard connection is encrypted\n"); } else { hid_warn(hdev, "Keyboard events are send over the air in plain-text / unencrypted\n"); hid_warn(hdev, "See: https://gitlab.freedesktop.org/jwrdegoede/logitech-27mhz-keyboard-encryption-setup/\n"); } fallthrough; case 4: /* Index 4 is used for an optional separate numpad */ workitem->device_type = HIDPP_DEVICE_TYPE_KEYBOARD; workitem->reports_supported |= STD_KEYBOARD | MULTIMEDIA | POWER_KEYS | HIDPP; break; default: hid_warn(hdev, "%s: unexpected device-index %d", __func__, hidpp_report->device_index); } } static void logi_hidpp_recv_queue_notif(struct hid_device *hdev, struct hidpp_event *hidpp_report) { /* We are called from atomic context (tasklet && djrcv->lock held) */ struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); const char *device_type = "UNKNOWN"; struct dj_workitem workitem = { .type = WORKITEM_TYPE_EMPTY, .device_index = hidpp_report->device_index, }; switch (hidpp_report->params[HIDPP_PARAM_PROTO_TYPE]) { case 0x01: device_type = "Bluetooth"; /* Bluetooth connect packet contents is the same as (e)QUAD */ logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); if (!(hidpp_report->params[HIDPP_PARAM_DEVICE_INFO] & HIDPP_MANUFACTURER_MASK)) { hid_info(hdev, "Non Logitech device connected on slot %d\n", hidpp_report->device_index); workitem.reports_supported &= ~HIDPP; } break; case 0x02: device_type = "27 Mhz"; logi_hidpp_dev_conn_notif_27mhz(hdev, hidpp_report, &workitem); break; case 0x03: device_type = "QUAD or eQUAD"; 
logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x04: device_type = "eQUAD step 4 DJ"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x05: device_type = "DFU Lite"; break; case 0x06: device_type = "eQUAD step 4 Lite"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x07: device_type = "eQUAD step 4 Gaming"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; break; case 0x08: device_type = "eQUAD step 4 for gamepads"; break; case 0x0a: device_type = "eQUAD nano Lite"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); break; case 0x0c: device_type = "eQUAD Lightspeed 1"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; break; case 0x0d: device_type = "eQUAD Lightspeed 1.1"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; break; case 0x0f: case 0x11: device_type = "eQUAD Lightspeed 1.2"; logi_hidpp_dev_conn_notif_equad(hdev, hidpp_report, &workitem); workitem.reports_supported |= STD_KEYBOARD; break; } /* custom receiver device (eg. 
powerplay) */ if (hidpp_report->device_index == 7) { workitem.reports_supported |= HIDPP; } if (workitem.type == WORKITEM_TYPE_EMPTY) { hid_warn(hdev, "unusable device of type %s (0x%02x) connected on slot %d", device_type, hidpp_report->params[HIDPP_PARAM_PROTO_TYPE], hidpp_report->device_index); return; } hid_info(hdev, "device of type %s (0x%02x) connected on slot %d", device_type, hidpp_report->params[HIDPP_PARAM_PROTO_TYPE], hidpp_report->device_index); kfifo_in(&djrcv_dev->notif_fifo, &workitem, sizeof(workitem)); schedule_work(&djrcv_dev->work); } static void logi_dj_recv_forward_null_report(struct dj_receiver_dev *djrcv_dev, struct dj_report *dj_report) { /* We are called from atomic context (tasklet && djrcv->lock held) */ unsigned int i; u8 reportbuffer[MAX_REPORT_SIZE]; struct dj_device *djdev; djdev = djrcv_dev->paired_dj_devices[dj_report->device_index]; memset(reportbuffer, 0, sizeof(reportbuffer)); for (i = 0; i < NUMBER_OF_HID_REPORTS; i++) { if (djdev->reports_supported & (1 << i)) { reportbuffer[0] = i; if (hid_input_report(djdev->hdev, HID_INPUT_REPORT, reportbuffer, hid_reportid_size_map[i], 1)) { dbg_hid("hid_input_report error sending null " "report\n"); } } } } static void logi_dj_recv_forward_dj(struct dj_receiver_dev *djrcv_dev, struct dj_report *dj_report) { /* We are called from atomic context (tasklet && djrcv->lock held) */ struct dj_device *dj_device; dj_device = djrcv_dev->paired_dj_devices[dj_report->device_index]; if ((dj_report->report_type > ARRAY_SIZE(hid_reportid_size_map) - 1) || (hid_reportid_size_map[dj_report->report_type] == 0)) { dbg_hid("invalid report type:%x\n", dj_report->report_type); return; } if (hid_input_report(dj_device->hdev, HID_INPUT_REPORT, &dj_report->report_type, hid_reportid_size_map[dj_report->report_type], 1)) { dbg_hid("hid_input_report error\n"); } } static void logi_dj_recv_forward_report(struct dj_device *dj_dev, u8 *data, int size) { /* We are called from atomic context (tasklet && djrcv->lock held) 
/*
 * Hand a raw input report to the first paired device whose
 * reports_supported mask contains the report number found in data[0].
 *
 * Report numbers above REPORT_TYPE_RFREPORT_LAST are rejected with an
 * error message.  When no paired device claims the report, the "unknown
 * device" work is queued instead.  The paired-device table is walked with
 * djrcv_dev->lock held.
 */
static void logi_dj_recv_forward_input_report(struct hid_device *hdev,
					      u8 *data, int size)
{
	struct dj_receiver_dev *rcv = hid_get_drvdata(hdev);
	struct dj_device *target = NULL;
	unsigned long irqflags;
	const u8 report_nr = data[0];
	int slot;

	if (report_nr > REPORT_TYPE_RFREPORT_LAST) {
		hid_err(hdev, "Unexpected input report number %d\n", report_nr);
		return;
	}

	spin_lock_irqsave(&rcv->lock, irqflags);

	/* Pick the first slot that is populated and supports this report. */
	for (slot = 0; slot < (DJ_MAX_PAIRED_DEVICES + DJ_DEVICE_INDEX_MIN); slot++) {
		struct dj_device *cand = rcv->paired_dj_devices[slot];

		if (!cand || !(cand->reports_supported & BIT(report_nr)))
			continue;

		target = cand;
		break;
	}

	if (target) {
		logi_dj_recv_forward_report(target, data, size);
		spin_unlock_irqrestore(&rcv->lock, irqflags);
		return;
	}

	/* Nobody handles it: let the worker deal with the unknown device. */
	logi_dj_recv_queue_unknown_work(rcv);
	spin_unlock_irqrestore(&rcv->lock, irqflags);

	dbg_hid("No dj-devs handling input report number %d\n", report_nr);
}
hid_hw_raw_request(djrcv_dev->hidpp, REPORT_ID_HIDPP_SHORT, hidpp_report, sizeof(template), HID_OUTPUT_REPORT, HID_REQ_SET_REPORT); kfree(hidpp_report); return (retval < 0) ? retval : 0; } static int logi_dj_recv_query_paired_devices(struct dj_receiver_dev *djrcv_dev) { struct dj_report *dj_report; int retval; djrcv_dev->last_query = jiffies; if (djrcv_dev->type != recvr_type_dj) return logi_dj_recv_query_hidpp_devices(djrcv_dev); dj_report = kzalloc(sizeof(struct dj_report), GFP_KERNEL); if (!dj_report) return -ENOMEM; dj_report->report_id = REPORT_ID_DJ_SHORT; dj_report->device_index = HIDPP_RECEIVER_INDEX; dj_report->report_type = REPORT_TYPE_CMD_GET_PAIRED_DEVICES; retval = logi_dj_recv_send_report(djrcv_dev, dj_report); kfree(dj_report); return retval; } static int logi_dj_recv_switch_to_dj_mode(struct dj_receiver_dev *djrcv_dev, unsigned timeout) { struct hid_device *hdev = djrcv_dev->hidpp; struct dj_report *dj_report; u8 *buf; int retval = 0; dj_report = kzalloc(sizeof(struct dj_report), GFP_KERNEL); if (!dj_report) return -ENOMEM; if (djrcv_dev->type == recvr_type_dj) { dj_report->report_id = REPORT_ID_DJ_SHORT; dj_report->device_index = HIDPP_RECEIVER_INDEX; dj_report->report_type = REPORT_TYPE_CMD_SWITCH; dj_report->report_params[CMD_SWITCH_PARAM_DEVBITFIELD] = 0x3F; dj_report->report_params[CMD_SWITCH_PARAM_TIMEOUT_SECONDS] = (u8)timeout; retval = logi_dj_recv_send_report(djrcv_dev, dj_report); /* * Ugly sleep to work around a USB 3.0 bug when the receiver is * still processing the "switch-to-dj" command while we send an * other command. * 50 msec should gives enough time to the receiver to be ready. */ msleep(50); if (retval) { kfree(dj_report); return retval; } } /* * Magical bits to set up hidpp notifications when the dj devices * are connected/disconnected. * * We can reuse dj_report because HIDPP_REPORT_SHORT_LENGTH is smaller * than DJREPORT_SHORT_LENGTH. 
*/ buf = (u8 *)dj_report; memset(buf, 0, HIDPP_REPORT_SHORT_LENGTH); buf[0] = REPORT_ID_HIDPP_SHORT; buf[1] = HIDPP_RECEIVER_INDEX; buf[2] = 0x80; buf[3] = 0x00; buf[4] = 0x00; buf[5] = 0x09; buf[6] = 0x00; retval = hid_hw_raw_request(hdev, REPORT_ID_HIDPP_SHORT, buf, HIDPP_REPORT_SHORT_LENGTH, HID_OUTPUT_REPORT, HID_REQ_SET_REPORT); kfree(dj_report); return retval; } static int logi_dj_ll_open(struct hid_device *hid) { dbg_hid("%s: %s\n", __func__, hid->phys); return 0; } static void logi_dj_ll_close(struct hid_device *hid) { dbg_hid("%s: %s\n", __func__, hid->phys); } /* * Register 0xB5 is "pairing information". It is solely intended for the * receiver, so do not overwrite the device index. */ static u8 unifying_pairing_query[] = { REPORT_ID_HIDPP_SHORT, HIDPP_RECEIVER_INDEX, HIDPP_GET_LONG_REGISTER, HIDPP_REG_PAIRING_INFORMATION }; static u8 unifying_pairing_answer[] = { REPORT_ID_HIDPP_LONG, HIDPP_RECEIVER_INDEX, HIDPP_GET_LONG_REGISTER, HIDPP_REG_PAIRING_INFORMATION }; static int logi_dj_ll_raw_request(struct hid_device *hid, unsigned char reportnum, __u8 *buf, size_t count, unsigned char report_type, int reqtype) { struct dj_device *djdev = hid->driver_data; struct dj_receiver_dev *djrcv_dev = djdev->dj_receiver_dev; u8 *out_buf; int ret; if ((buf[0] == REPORT_ID_HIDPP_SHORT) || (buf[0] == REPORT_ID_HIDPP_LONG) || (buf[0] == REPORT_ID_HIDPP_VERY_LONG)) { if (count < 2) return -EINVAL; /* special case where we should not overwrite * the device_index */ if (count == 7 && !memcmp(buf, unifying_pairing_query, sizeof(unifying_pairing_query))) buf[4] = (buf[4] & 0xf0) | (djdev->device_index - 1); else buf[1] = djdev->device_index; return hid_hw_raw_request(djrcv_dev->hidpp, reportnum, buf, count, report_type, reqtype); } if (buf[0] != REPORT_TYPE_LEDS) return -EINVAL; if (djrcv_dev->type != recvr_type_dj && count >= 2) { if (!djrcv_dev->keyboard) { hid_warn(hid, "Received REPORT_TYPE_LEDS request before the keyboard interface was enumerated\n"); return 0; } /* 
usbhid overrides the report ID and ignores the first byte */ return hid_hw_raw_request(djrcv_dev->keyboard, 0, buf, count, report_type, reqtype); } out_buf = kzalloc(DJREPORT_SHORT_LENGTH, GFP_ATOMIC); if (!out_buf) return -ENOMEM; if (count > DJREPORT_SHORT_LENGTH - 2) count = DJREPORT_SHORT_LENGTH - 2; out_buf[0] = REPORT_ID_DJ_SHORT; out_buf[1] = djdev->device_index; memcpy(out_buf + 2, buf, count); ret = hid_hw_raw_request(djrcv_dev->hidpp, out_buf[0], out_buf, DJREPORT_SHORT_LENGTH, report_type, reqtype); kfree(out_buf); return ret; } static void rdcat(char *rdesc, unsigned int *rsize, const char *data, unsigned int size) { memcpy(rdesc + *rsize, data, size); *rsize += size; } static int logi_dj_ll_parse(struct hid_device *hid) { struct dj_device *djdev = hid->driver_data; unsigned int rsize = 0; char *rdesc; int retval; dbg_hid("%s\n", __func__); djdev->hdev->version = 0x0111; djdev->hdev->country = 0x00; rdesc = kmalloc(MAX_RDESC_SIZE, GFP_KERNEL); if (!rdesc) return -ENOMEM; if (djdev->reports_supported & STD_KEYBOARD) { dbg_hid("%s: sending a kbd descriptor, reports_supported: %llx\n", __func__, djdev->reports_supported); rdcat(rdesc, &rsize, kbd_descriptor, sizeof(kbd_descriptor)); } if (djdev->reports_supported & STD_MOUSE) { dbg_hid("%s: sending a mouse descriptor, reports_supported: %llx\n", __func__, djdev->reports_supported); if (djdev->dj_receiver_dev->type == recvr_type_gaming_hidpp || djdev->dj_receiver_dev->type == recvr_type_mouse_only) rdcat(rdesc, &rsize, mse_high_res_descriptor, sizeof(mse_high_res_descriptor)); else if (djdev->dj_receiver_dev->type == recvr_type_27mhz) rdcat(rdesc, &rsize, mse_27mhz_descriptor, sizeof(mse_27mhz_descriptor)); else if (recvr_type_is_bluetooth(djdev->dj_receiver_dev->type)) rdcat(rdesc, &rsize, mse_bluetooth_descriptor, sizeof(mse_bluetooth_descriptor)); else rdcat(rdesc, &rsize, mse_descriptor, sizeof(mse_descriptor)); } if (djdev->reports_supported & KBD_MOUSE) { dbg_hid("%s: sending a kbd-mouse descriptor, 
reports_supported: %llx\n", __func__, djdev->reports_supported); rdcat(rdesc, &rsize, mse5_bluetooth_descriptor, sizeof(mse5_bluetooth_descriptor)); } if (djdev->reports_supported & MULTIMEDIA) { dbg_hid("%s: sending a multimedia report descriptor: %llx\n", __func__, djdev->reports_supported); rdcat(rdesc, &rsize, consumer_descriptor, sizeof(consumer_descriptor)); } if (djdev->reports_supported & POWER_KEYS) { dbg_hid("%s: sending a power keys report descriptor: %llx\n", __func__, djdev->reports_supported); rdcat(rdesc, &rsize, syscontrol_descriptor, sizeof(syscontrol_descriptor)); } if (djdev->reports_supported & MEDIA_CENTER) { dbg_hid("%s: sending a media center report descriptor: %llx\n", __func__, djdev->reports_supported); rdcat(rdesc, &rsize, media_descriptor, sizeof(media_descriptor)); } if (djdev->reports_supported & KBD_LEDS) { dbg_hid("%s: need to send kbd leds report descriptor: %llx\n", __func__, djdev->reports_supported); } if (djdev->reports_supported & HIDPP) { dbg_hid("%s: sending a HID++ descriptor, reports_supported: %llx\n", __func__, djdev->reports_supported); rdcat(rdesc, &rsize, hidpp_descriptor, sizeof(hidpp_descriptor)); } retval = hid_parse_report(hid, rdesc, rsize); kfree(rdesc); return retval; } static int logi_dj_ll_start(struct hid_device *hid) { dbg_hid("%s\n", __func__); return 0; } static void logi_dj_ll_stop(struct hid_device *hid) { dbg_hid("%s\n", __func__); } static bool logi_dj_ll_may_wakeup(struct hid_device *hid) { struct dj_device *djdev = hid->driver_data; struct dj_receiver_dev *djrcv_dev = djdev->dj_receiver_dev; return hid_hw_may_wakeup(djrcv_dev->hidpp); } static const struct hid_ll_driver logi_dj_ll_driver = { .parse = logi_dj_ll_parse, .start = logi_dj_ll_start, .stop = logi_dj_ll_stop, .open = logi_dj_ll_open, .close = logi_dj_ll_close, .raw_request = logi_dj_ll_raw_request, .may_wakeup = logi_dj_ll_may_wakeup, }; static int logi_dj_dj_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { 
struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); struct dj_report *dj_report = (struct dj_report *) data; unsigned long flags; /* * Here we receive all data coming from iface 2, there are 3 cases: * * 1) Data is intended for this driver i. e. data contains arrival, * departure, etc notifications, in which case we queue them for delayed * processing by the work queue. We return 1 to hid-core as no further * processing is required from it. * * 2) Data informs a connection change, if the change means rf link * loss, then we must send a null report to the upper layer to discard * potentially pressed keys that may be repeated forever by the input * layer. Return 1 to hid-core as no further processing is required. * * 3) Data is an actual input event from a paired DJ device in which * case we forward it to the correct hid device (via hid_input_report() * ) and return 1 so hid-core does not anything else with it. */ if ((dj_report->device_index < DJ_DEVICE_INDEX_MIN) || (dj_report->device_index > DJ_DEVICE_INDEX_MAX)) { /* * Device index is wrong, bail out. * This driver can ignore safely the receiver notifications, * so ignore those reports too. 
*/ if (dj_report->device_index != DJ_RECEIVER_INDEX) hid_err(hdev, "%s: invalid device index:%d\n", __func__, dj_report->device_index); return false; } spin_lock_irqsave(&djrcv_dev->lock, flags); if (!djrcv_dev->paired_dj_devices[dj_report->device_index]) { /* received an event for an unknown device, bail out */ logi_dj_recv_queue_notification(djrcv_dev, dj_report); goto out; } switch (dj_report->report_type) { case REPORT_TYPE_NOTIF_DEVICE_PAIRED: /* pairing notifications are handled above the switch */ break; case REPORT_TYPE_NOTIF_DEVICE_UNPAIRED: logi_dj_recv_queue_notification(djrcv_dev, dj_report); break; case REPORT_TYPE_NOTIF_CONNECTION_STATUS: if (dj_report->report_params[CONNECTION_STATUS_PARAM_STATUS] == STATUS_LINKLOSS) { logi_dj_recv_forward_null_report(djrcv_dev, dj_report); } break; default: logi_dj_recv_forward_dj(djrcv_dev, dj_report); } out: spin_unlock_irqrestore(&djrcv_dev->lock, flags); return true; } static int logi_dj_hidpp_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); struct hidpp_event *hidpp_report = (struct hidpp_event *) data; struct dj_device *dj_dev; unsigned long flags; u8 device_index = hidpp_report->device_index; if (device_index == HIDPP_RECEIVER_INDEX) { /* special case were the device wants to know its unifying * name */ if (size == HIDPP_REPORT_LONG_LENGTH && !memcmp(data, unifying_pairing_answer, sizeof(unifying_pairing_answer))) device_index = (data[4] & 0x0F) + 1; else return false; } /* * Data is from the HID++ collection, in this case, we forward the * data to the corresponding child dj device and return 0 to hid-core * so he data also goes to the hidraw device of the receiver. This * allows a user space application to implement the full HID++ routing * via the receiver. */ if ((device_index < DJ_DEVICE_INDEX_MIN) || (device_index > DJ_DEVICE_INDEX_MAX)) { /* * Device index is wrong, bail out. 
* This driver can ignore safely the receiver notifications, * so ignore those reports too. */ hid_err(hdev, "%s: invalid device index:%d\n", __func__, hidpp_report->device_index); return false; } spin_lock_irqsave(&djrcv_dev->lock, flags); dj_dev = djrcv_dev->paired_dj_devices[device_index]; /* * With 27 MHz receivers, we do not get an explicit unpair event, * remove the old device if the user has paired a *different* device. */ if (djrcv_dev->type == recvr_type_27mhz && dj_dev && hidpp_report->sub_id == REPORT_TYPE_NOTIF_DEVICE_CONNECTED && hidpp_report->params[HIDPP_PARAM_PROTO_TYPE] == 0x02 && hidpp_report->params[HIDPP_PARAM_27MHZ_DEVID] != dj_dev->hdev->product) { struct dj_workitem workitem = { .device_index = hidpp_report->device_index, .type = WORKITEM_TYPE_UNPAIRED, }; kfifo_in(&djrcv_dev->notif_fifo, &workitem, sizeof(workitem)); /* logi_hidpp_recv_queue_notif will queue the work */ dj_dev = NULL; } if (dj_dev) { logi_dj_recv_forward_report(dj_dev, data, size); } else { if (hidpp_report->sub_id == REPORT_TYPE_NOTIF_DEVICE_CONNECTED) logi_hidpp_recv_queue_notif(hdev, hidpp_report); else logi_dj_recv_queue_unknown_work(djrcv_dev); } spin_unlock_irqrestore(&djrcv_dev->lock, flags); return false; } static int logi_dj_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); dbg_hid("%s, size:%d\n", __func__, size); if (!djrcv_dev) return 0; if (!hdev->report_enum[HID_INPUT_REPORT].numbered) { if (djrcv_dev->unnumbered_application == HID_GD_KEYBOARD) { /* * For the keyboard, we can reuse the same report by * using the second byte which is constant in the USB * HID report descriptor. */ data[1] = data[0]; data[0] = REPORT_TYPE_KEYBOARD; logi_dj_recv_forward_input_report(hdev, data, size); /* restore previous state */ data[0] = data[1]; data[1] = 0; } /* * Mouse-only receivers send unnumbered mouse data. The 27 MHz * receiver uses 6 byte packets, the nano receiver 8 bytes. 
*/ if (djrcv_dev->unnumbered_application == HID_GD_MOUSE && size <= 8) { u8 mouse_report[9]; /* Prepend report id */ mouse_report[0] = REPORT_TYPE_MOUSE; memcpy(mouse_report + 1, data, size); logi_dj_recv_forward_input_report(hdev, mouse_report, size + 1); } return false; } switch (data[0]) { case REPORT_ID_DJ_SHORT: if (size != DJREPORT_SHORT_LENGTH) { hid_err(hdev, "Short DJ report bad size (%d)", size); return false; } return logi_dj_dj_event(hdev, report, data, size); case REPORT_ID_DJ_LONG: if (size != DJREPORT_LONG_LENGTH) { hid_err(hdev, "Long DJ report bad size (%d)", size); return false; } return logi_dj_dj_event(hdev, report, data, size); case REPORT_ID_HIDPP_SHORT: if (size != HIDPP_REPORT_SHORT_LENGTH) { hid_err(hdev, "Short HID++ report bad size (%d)", size); return false; } return logi_dj_hidpp_event(hdev, report, data, size); case REPORT_ID_HIDPP_LONG: if (size != HIDPP_REPORT_LONG_LENGTH) { hid_err(hdev, "Long HID++ report bad size (%d)", size); return false; } return logi_dj_hidpp_event(hdev, report, data, size); } logi_dj_recv_forward_input_report(hdev, data, size); return false; } static int logi_dj_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct dj_receiver_dev *djrcv_dev; struct usb_interface *intf; unsigned int no_dj_interfaces = 0; bool has_hidpp = false; unsigned long flags; int retval; /* * Call to usbhid to fetch the HID descriptors of the current * interface subsequently call to the hid/hid-core to parse the * fetched descriptors. */ retval = hid_parse(hdev); if (retval) { hid_err(hdev, "%s: parse failed\n", __func__); return retval; } /* * Some KVMs add an extra interface for e.g. mouse emulation. If we * treat these as logitech-dj interfaces then this causes input events * reported through this extra interface to not be reported correctly. * To avoid this, we treat these as generic-hid devices. 
*/ switch (id->driver_data) { case recvr_type_dj: no_dj_interfaces = 3; break; case recvr_type_hidpp: no_dj_interfaces = 2; break; case recvr_type_gaming_hidpp: no_dj_interfaces = 3; break; case recvr_type_mouse_only: no_dj_interfaces = 2; break; case recvr_type_27mhz: no_dj_interfaces = 2; break; case recvr_type_bluetooth: no_dj_interfaces = 2; break; case recvr_type_dinovo: no_dj_interfaces = 2; break; } if (hid_is_usb(hdev)) { intf = to_usb_interface(hdev->dev.parent); if (intf && intf->altsetting->desc.bInterfaceNumber >= no_dj_interfaces) { hdev->quirks |= HID_QUIRK_INPUT_PER_APP; return hid_hw_start(hdev, HID_CONNECT_DEFAULT); } } rep_enum = &hdev->report_enum[HID_INPUT_REPORT]; /* no input reports, bail out */ if (list_empty(&rep_enum->report_list)) return -ENODEV; /* * Check for the HID++ application. * Note: we should theoretically check for HID++ and DJ * collections, but this will do. */ list_for_each_entry(rep, &rep_enum->report_list, list) { if (rep->application == 0xff000001) has_hidpp = true; } /* * Ignore interfaces without DJ/HID++ collection, they will not carry * any data, dont create any hid_device for them. 
*/ if (!has_hidpp && id->driver_data == recvr_type_dj) return -ENODEV; /* get the current application attached to the node */ rep = list_first_entry(&rep_enum->report_list, struct hid_report, list); djrcv_dev = dj_get_receiver_dev(hdev, id->driver_data, rep->application, has_hidpp); if (!djrcv_dev) { hid_err(hdev, "%s: dj_get_receiver_dev failed\n", __func__); return -ENOMEM; } if (!rep_enum->numbered) djrcv_dev->unnumbered_application = rep->application; /* Starts the usb device and connects to upper interfaces hiddev and * hidraw */ retval = hid_hw_start(hdev, HID_CONNECT_HIDRAW|HID_CONNECT_HIDDEV); if (retval) { hid_err(hdev, "%s: hid_hw_start returned error\n", __func__); goto hid_hw_start_fail; } if (has_hidpp) { retval = logi_dj_recv_switch_to_dj_mode(djrcv_dev, 0); if (retval < 0) { hid_err(hdev, "%s: logi_dj_recv_switch_to_dj_mode returned error:%d\n", __func__, retval); goto switch_to_dj_mode_fail; } } /* This is enabling the polling urb on the IN endpoint */ retval = hid_hw_open(hdev); if (retval < 0) { hid_err(hdev, "%s: hid_hw_open returned error:%d\n", __func__, retval); goto llopen_failed; } /* Allow incoming packets to arrive: */ hid_device_io_start(hdev); if (has_hidpp) { spin_lock_irqsave(&djrcv_dev->lock, flags); djrcv_dev->ready = true; spin_unlock_irqrestore(&djrcv_dev->lock, flags); retval = logi_dj_recv_query_paired_devices(djrcv_dev); if (retval < 0) { hid_err(hdev, "%s: logi_dj_recv_query_paired_devices error:%d\n", __func__, retval); /* * This can happen with a KVM, let the probe succeed, * logi_dj_recv_queue_unknown_work will retry later. 
*/ } } return 0; llopen_failed: switch_to_dj_mode_fail: hid_hw_stop(hdev); hid_hw_start_fail: dj_put_receiver_dev(hdev); return retval; } #ifdef CONFIG_PM static int logi_dj_reset_resume(struct hid_device *hdev) { int retval; struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); if (!djrcv_dev || djrcv_dev->hidpp != hdev) return 0; retval = logi_dj_recv_switch_to_dj_mode(djrcv_dev, 0); if (retval < 0) { hid_err(hdev, "%s: logi_dj_recv_switch_to_dj_mode returned error:%d\n", __func__, retval); } return 0; } #endif static void logi_dj_remove(struct hid_device *hdev) { struct dj_receiver_dev *djrcv_dev = hid_get_drvdata(hdev); struct dj_device *dj_dev; unsigned long flags; int i; dbg_hid("%s\n", __func__); if (!djrcv_dev) return hid_hw_stop(hdev); /* * This ensures that if the work gets requeued from another * interface of the same receiver it will be a no-op. */ spin_lock_irqsave(&djrcv_dev->lock, flags); djrcv_dev->ready = false; spin_unlock_irqrestore(&djrcv_dev->lock, flags); cancel_work_sync(&djrcv_dev->work); hid_hw_close(hdev); hid_hw_stop(hdev); /* * For proper operation we need access to all interfaces, so we destroy * the paired devices when we're unbound from any interface. * * Note we may still be bound to other interfaces, sharing the same * djrcv_dev, so we need locking here. 
*/ for (i = 0; i < (DJ_MAX_PAIRED_DEVICES + DJ_DEVICE_INDEX_MIN); i++) { spin_lock_irqsave(&djrcv_dev->lock, flags); dj_dev = djrcv_dev->paired_dj_devices[i]; djrcv_dev->paired_dj_devices[i] = NULL; spin_unlock_irqrestore(&djrcv_dev->lock, flags); if (dj_dev != NULL) { hid_destroy_device(dj_dev->hdev); kfree(dj_dev); } } dj_put_receiver_dev(hdev); } static const struct hid_device_id logi_dj_receivers[] = { { /* Logitech unifying receiver (0xc52b) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER), .driver_data = recvr_type_dj}, { /* Logitech unifying receiver (0xc532) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_UNIFYING_RECEIVER_2), .driver_data = recvr_type_dj}, { /* Logitech Nano mouse only receiver (0xc52f) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER), .driver_data = recvr_type_mouse_only}, { /* Logitech Nano (non DJ) receiver (0xc534) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_2), .driver_data = recvr_type_hidpp}, { /* Logitech G700(s) receiver (0xc531) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G700_RECEIVER), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech G602 receiver (0xc537) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 0xc537), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech lightspeed receiver (0xc539) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech powerplay receiver (0xc53a) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_POWERPLAY), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech lightspeed receiver (0xc53f) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_NANO_RECEIVER_LIGHTSPEED_1_1), .driver_data = recvr_type_gaming_hidpp}, { /* Logitech 27 MHz HID++ 1.0 receiver (0xc513) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, 
USB_DEVICE_ID_MX3000_RECEIVER), .driver_data = recvr_type_27mhz}, { /* Logitech 27 MHz HID++ 1.0 receiver (0xc517) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_S510_RECEIVER_2), .driver_data = recvr_type_27mhz}, { /* Logitech 27 MHz HID++ 1.0 mouse-only receiver (0xc51b) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_27MHZ_MOUSE_RECEIVER), .driver_data = recvr_type_27mhz}, { /* Logitech MX5000 HID++ / bluetooth receiver keyboard intf. (0xc70e) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX5000_RECEIVER_KBD_DEV), .driver_data = recvr_type_bluetooth}, { /* Logitech MX5000 HID++ / bluetooth receiver mouse intf. (0xc70a) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX5000_RECEIVER_MOUSE_DEV), .driver_data = recvr_type_bluetooth}, { /* Logitech MX5500 HID++ / bluetooth receiver keyboard intf. (0xc71b) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX5500_RECEIVER_KBD_DEV), .driver_data = recvr_type_bluetooth}, { /* Logitech MX5500 HID++ / bluetooth receiver mouse intf. (0xc71c) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_MX5500_RECEIVER_MOUSE_DEV), .driver_data = recvr_type_bluetooth}, { /* Logitech Dinovo Edge HID++ / bluetooth receiver keyboard intf. (0xc713) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_EDGE_RECEIVER_KBD_DEV), .driver_data = recvr_type_dinovo}, { /* Logitech Dinovo Edge HID++ / bluetooth receiver mouse intf. (0xc714) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_EDGE_RECEIVER_MOUSE_DEV), .driver_data = recvr_type_dinovo}, { /* Logitech DiNovo Mini HID++ / bluetooth receiver mouse intf. (0xc71e) */ HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_DINOVO_MINI_RECEIVER_KBD_DEV), .driver_data = recvr_type_dinovo}, { /* Logitech DiNovo Mini HID++ / bluetooth receiver keyboard intf. 
/*
 * HID driver description handed to module_hid_driver(): matches the devices
 * listed in logi_dj_receivers and wires up the probe, remove and raw_event
 * callbacks.  The reset_resume callback is only present when CONFIG_PM is
 * enabled, matching the #ifdef around logi_dj_reset_resume().
 */
static struct hid_driver logi_djreceiver_driver = {
	.name = "logitech-djreceiver",
	.id_table = logi_dj_receivers,
	.probe = logi_dj_probe,
	.remove = logi_dj_remove,
	.raw_event = logi_dj_raw_event,
#ifdef CONFIG_PM
	.reset_resume = logi_dj_reset_resume,
#endif
};
55 64 50 5 7 21 35 61 4 2 2 20 20 17 3 7 7 23 7 53 53 53 35 35 32 8 34 14 33 13 27 16 13 2 7 5 26 27 12 18 27 23 23 4 4 4 13 7 1 4 1 1 1 1 3 3 2 1 8 1 3 8 2 5 14 1 3 11 1 23 5 1 1 5 6 1 1 1 1 75 36 39 2 2 1 1 1 1 2 2 123 1 4 1 1 1 1 2 1 1 1 56 63 2 155 13 140 26 23 4 4 3 31 31 1 31 28 10 28 7 3 21 12 1 11 11 8 8 1 1 1 4 1 1 1 3 1 3 2 2 6 11 1 1 10 4 36 17 2 30 3 3 3 13 1 13 16 36 36 36 10 35 36 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 
928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 // SPDX-License-Identifier: GPL-2.0-only /* * net/dccp/proto.c * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #include <linux/dccp.h> #include <linux/module.h> #include <linux/types.h> 
#include <linux/sched.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/random.h> #include <linux/slab.h> #include <net/checksum.h> #include <net/inet_sock.h> #include <net/inet_common.h> #include <net/sock.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/delay.h> #include <linux/poll.h> #include "ccid.h" #include "dccp.h" #include "feat.h" #define CREATE_TRACE_POINTS #include "trace.h" DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); DEFINE_PER_CPU(unsigned int, dccp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; #ifdef CONFIG_IP_DCCP_DEBUG static const char *dccp_state_name(const int state) { static const char *const dccp_state_names[] = { [DCCP_OPEN] = "OPEN", [DCCP_REQUESTING] = "REQUESTING", [DCCP_PARTOPEN] = "PARTOPEN", [DCCP_LISTEN] = "LISTEN", [DCCP_RESPOND] = "RESPOND", [DCCP_CLOSING] = "CLOSING", [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", [DCCP_TIME_WAIT] = "TIME_WAIT", [DCCP_CLOSED] = "CLOSED", }; if (state >= DCCP_MAX_STATES) return "INVALID STATE!"; else return dccp_state_names[state]; } #endif void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, dccp_state_name(oldstate), dccp_state_name(state)); WARN_ON(state == oldstate); switch (state) { case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); /* Client retransmits all Confirm options until entering OPEN */ if (oldstate == DCCP_PARTOPEN) 
dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || oldstate == DCCP_CLOSING) DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash != NULL && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == DCCP_OPEN) DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_set_state(sk, state); } EXPORT_SYMBOL_GPL(dccp_set_state); static void dccp_finish_passive_close(struct sock *sk) { switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: /* Node (client or server) has received Close packet. */ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); dccp_set_state(sk, DCCP_CLOSED); break; case DCCP_PASSIVE_CLOSEREQ: /* * Client received CloseReq. We set the `active' flag so that * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. */ dccp_send_close(sk, 1); dccp_set_state(sk, DCCP_CLOSING); } } void dccp_done(struct sock *sk) { dccp_set_state(sk, DCCP_CLOSED); dccp_clear_xmit_timers(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(dccp_done); const char *dccp_packet_name(const int type) { static const char *const dccp_packet_names[] = { [DCCP_PKT_REQUEST] = "REQUEST", [DCCP_PKT_RESPONSE] = "RESPONSE", [DCCP_PKT_DATA] = "DATA", [DCCP_PKT_ACK] = "ACK", [DCCP_PKT_DATAACK] = "DATAACK", [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", [DCCP_PKT_CLOSE] = "CLOSE", [DCCP_PKT_RESET] = "RESET", [DCCP_PKT_SYNC] = "SYNC", [DCCP_PKT_SYNCACK] = "SYNCACK", }; if (type >= DCCP_NR_PKT_TYPES) return "INVALID"; else return dccp_packet_names[type]; } EXPORT_SYMBOL_GPL(dccp_packet_name); void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; } 
EXPORT_SYMBOL_GPL(dccp_destruct_common); static void dccp_sk_destruct(struct sock *sk) { dccp_destruct_common(sk); inet_sock_destruct(sk); } int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " "please contact the netdev mailing list\n"); icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); INIT_LIST_HEAD(&dp->dccps_featneg); /* control socket doesn't need feat nego */ if (likely(ctl_sock_initialized)) return dccp_feat_init(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } /* Clean up a referenced DCCP bind bucket. 
*/ if (inet_csk(sk)->icsk_bind_hash != NULL) inet_put_port(sk); kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); static inline int dccp_need_reset(int state) { return state != DCCP_CLOSED && state != DCCP_LISTEN && state != DCCP_REQUESTING; } int dccp_disconnect(struct sock *sk, int flags) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); const int old_state = sk->sk_state; if (old_state != DCCP_CLOSED) dccp_set_state(sk, DCCP_CLOSED); /* * This corresponds to the ABORT function of RFC793, sec. 3.8 * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". */ if (old_state == DCCP_LISTEN) { inet_csk_listen_stop(sk); } else if (dccp_need_reset(old_state)) { dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); sk->sk_err = ECONNRESET; } else if (old_state == DCCP_REQUESTING) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = NULL; __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; } inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); icsk->icsk_backoff = 0; inet_csk_delack_init(sk); __sk_dst_reset(sk); WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk_error_report(sk); return 0; } EXPORT_SYMBOL_GPL(dccp_disconnect); /* * Wait for a DCCP event. 
* * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask; u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == DCCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes made by another threads is impossible in any case. */ mask = 0; if (READ_ONCE(sk->sk_err)) mask = EPOLLERR; shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected? */ if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { if (atomic_read(&sk->sk_rmem_alloc) > 0) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (sk_stream_is_writeable(sk)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. */ if (sk_stream_is_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; } } } return mask; } EXPORT_SYMBOL_GPL(dccp_poll); int dccp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) goto out; switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ rc = 0; } break; case SIOCINQ: { struct sk_buff *skb; *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* * We will only return the amount of this packet since * that is all that will be read. 
*/ *karg = skb->len; } rc = 0; } break; default: rc = -ENOIOCTLCMD; break; } out: release_sock(sk); return rc; } EXPORT_SYMBOL_GPL(dccp_ioctl); static int dccp_setsockopt_service(struct sock *sk, const __be32 service, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_service_list *sl = NULL; if (service == DCCP_SERVICE_INVALID_VALUE || optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) return -EINVAL; if (optlen > sizeof(service)) { sl = kmalloc(optlen, GFP_KERNEL); if (sl == NULL) return -ENOMEM; sl->dccpsl_nr = optlen / sizeof(u32) - 1; if (copy_from_sockptr_offset(sl->dccpsl_list, optval, sizeof(service), optlen - sizeof(service)) || dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { kfree(sl); return -EFAULT; } } lock_sock(sk); dp->dccps_service = service; kfree(dp->dccps_service_list); dp->dccps_service_list = sl; release_sock(sk); return 0; } static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) { u8 *list, len; int i, rc; if (cscov < 0 || cscov > 15) return -EINVAL; /* * Populate a list of permissible values, in the range cscov...15. This * is necessary since feature negotiation of single values only works if * both sides incidentally choose the same value. Since the list starts * lowest-value first, negotiation will pick the smallest shared value. 
*/ if (cscov == 0) return 0; len = 16 - cscov; list = kmalloc(len, GFP_KERNEL); if (list == NULL) return -ENOBUFS; for (i = 0; i < len; i++) list[i] = cscov++; rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); if (rc == 0) { if (rx) dccp_sk(sk)->dccps_pcrlen = cscov; else dccp_sk(sk)->dccps_pcslen = cscov; } kfree(list); return rc; } static int dccp_setsockopt_ccid(struct sock *sk, int type, sockptr_t optval, unsigned int optlen) { u8 *val; int rc = 0; if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; val = memdup_sockptr(optval, optlen); if (IS_ERR(val)) return PTR_ERR(val); lock_sock(sk); if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); release_sock(sk); kfree(val); return rc; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CHANGE_L: case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_CCID: case DCCP_SOCKOPT_RX_CCID: case DCCP_SOCKOPT_TX_CCID: return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; if (optname == DCCP_SOCKOPT_SERVICE) return dccp_setsockopt_service(sk, val, optval, optlen); lock_sock(sk); switch (optname) { case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; case DCCP_SOCKOPT_SEND_CSCOV: err = dccp_setsockopt_cscov(sk, val, false); break; case DCCP_SOCKOPT_RECV_CSCOV: err = 
dccp_setsockopt_cscov(sk, val, true); break; case DCCP_SOCKOPT_QPOLICY_ID: if (sk->sk_state != DCCP_CLOSED) err = -EISCONN; else if (val < 0 || val >= DCCPQ_POLICY_MAX) err = -EINVAL; else dp->dccps_qpolicy = val; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: if (val < 0) err = -EINVAL; else dp->dccps_tx_qlen = val; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname, optval, optlen); return do_dccp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_setsockopt); static int dccp_getsockopt_service(struct sock *sk, int len, __be32 __user *optval, int __user *optlen) { const struct dccp_sock *dp = dccp_sk(sk); const struct dccp_service_list *sl; int err = -ENOENT, slen = 0, total_len = sizeof(u32); lock_sock(sk); if ((sl = dp->dccps_service_list) != NULL) { slen = sl->dccpsl_nr * sizeof(u32); total_len += slen; } err = -EINVAL; if (total_len > len) goto out; err = 0; if (put_user(total_len, optlen) || put_user(dp->dccps_service, optval) || (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) err = -EFAULT; out: release_sock(sk); return err; } static int do_dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dccp_sock *dp; int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < (int)sizeof(int)) return -EINVAL; dp = dccp_sk(sk); switch (optname) { case DCCP_SOCKOPT_PACKET_SIZE: DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); return 0; case DCCP_SOCKOPT_SERVICE: return dccp_getsockopt_service(sk, len, (__be32 __user *)optval, optlen); case DCCP_SOCKOPT_GET_CUR_MPS: val = READ_ONCE(dp->dccps_mss_cache); break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_TX_CCID: val = 
ccid_get_current_tx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_RX_CCID: val = ccid_get_current_rx_ccid(dp); if (val < 0) return -ENOPROTOOPT; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; case DCCP_SOCKOPT_SEND_CSCOV: val = dp->dccps_pcslen; break; case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; case DCCP_SOCKOPT_QPOLICY_ID: val = dp->dccps_qpolicy; break; case DCCP_SOCKOPT_QPOLICY_TXQLEN: val = dp->dccps_tx_qlen; break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); case 192 ... 255: return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, len, (u32 __user *)optval, optlen); default: return -ENOPROTOOPT; } len = sizeof(val); if (put_user(len, optlen) || copy_to_user(optval, &val, len)) return -EFAULT; return 0; } int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level != SOL_DCCP) return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, optname, optval, optlen); return do_dccp_getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL_GPL(dccp_getsockopt); static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) { struct cmsghdr *cmsg; /* * Assign an (opaque) qpolicy priority value to skb->priority. * * We are overloading this skb field for use with the qpolicy subystem. * The skb->priority is normally used for the SO_PRIORITY option, which * is initialised from sk_priority. Since the assignment of sk_priority * to skb->priority happens later (on layer 3), we overload this field * for use with queueing priorities as long as the skb is on layer 4. * The default priority value (if nothing is set) is 0. 
*/ skb->priority = 0; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_DCCP) continue; if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) return -EINVAL; switch (cmsg->cmsg_type) { case DCCP_SCM_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) return -EINVAL; skb->priority = *(__u32 *)CMSG_DATA(cmsg); break; default: return -EINVAL; } } return 0; } int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { const struct dccp_sock *dp = dccp_sk(sk); const int flags = msg->msg_flags; const int noblock = flags & MSG_DONTWAIT; struct sk_buff *skb; int rc, size; long timeo; trace_dccp_probe(sk, len); if (len > READ_ONCE(dp->dccps_mss_cache)) return -EMSGSIZE; lock_sock(sk); timeo = sock_sndtimeo(sk, noblock); /* * We have to use sk_stream_wait_connect here to set sk_write_pending, * so that the trick in dccp_rcv_request_sent_state_process. */ /* Wait for a connection to finish. */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) goto out_release; size = sk->sk_prot->max_header + len; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (skb == NULL) goto out_release; if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_discard; } if (sk->sk_state == DCCP_CLOSED) { rc = -ENOTCONN; goto out_discard; } /* We need to check dccps_mss_cache after socket is locked. */ if (len > dp->dccps_mss_cache) { rc = -EMSGSIZE; goto out_discard; } skb_reserve(skb, sk->sk_prot->max_header); rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc != 0) goto out_discard; rc = dccp_msghdr_parse(msg, skb); if (rc != 0) goto out_discard; dccp_qpolicy_push(sk, skb); /* * The xmit_timer is set if the TX CCID is rate-based and will expire * when congestion control permits to release further packets into the * network. Window-based CCIDs do not use this timer. 
*/ if (!timer_pending(&dp->dccps_xmit_timer)) dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; out_discard: kfree_skb(skb); goto out_release; } EXPORT_SYMBOL_GPL(dccp_sendmsg); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { const struct dccp_hdr *dh; long timeo; lock_sock(sk); if (sk->sk_state == DCCP_LISTEN) { len = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) goto verify_sock_status; dh = dccp_hdr(skb); switch (dh->dccph_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: goto found_ok_skb; case DCCP_PKT_CLOSE: case DCCP_PKT_CLOSEREQ: if (!(flags & MSG_PEEK)) dccp_finish_passive_close(sk); fallthrough; case DCCP_PKT_RESET: dccp_pr_debug("found fin (%s) ok!\n", dccp_packet_name(dh->dccph_type)); len = 0; goto found_fin_ok; default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; break; } if (sk->sk_err) { len = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { len = 0; break; } if (sk->sk_state == DCCP_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read * from never connected socket. */ len = -ENOTCONN; break; } len = 0; break; } if (!timeo) { len = -EAGAIN; break; } if (signal_pending(current)) { len = sock_intr_errno(timeo); break; } sk_wait_data(sk, &timeo, NULL); continue; found_ok_skb: if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! 
*/ len = -EFAULT; break; } if (flags & MSG_TRUNC) len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) sk_eat_skb(sk, skb); break; } while (1); out: release_sock(sk); return len; } EXPORT_SYMBOL_GPL(dccp_recvmsg); int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; int err; lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) goto out; old_state = sk->sk_state; if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) goto out; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != DCCP_LISTEN) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; /* do not start to listen if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dp)) { err = -EPROTO; goto out; } err = inet_csk_listen_start(sk); if (err) goto out; } err = 0; out: release_sock(sk); return err; } EXPORT_SYMBOL_GPL(inet_dccp_listen); static void dccp_terminate_connection(struct sock *sk) { u8 next_state = DCCP_CLOSED; switch (sk->sk_state) { case DCCP_PASSIVE_CLOSE: case DCCP_PASSIVE_CLOSEREQ: dccp_finish_passive_close(sk); break; case DCCP_PARTOPEN: dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); fallthrough; case DCCP_OPEN: dccp_send_close(sk, 1); if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && !dccp_sk(sk)->dccps_server_timewait) next_state = DCCP_ACTIVE_CLOSEREQ; else next_state = DCCP_CLOSING; fallthrough; default: dccp_set_state(sk, next_state); } } void dccp_close(struct sock *sk, long timeout) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; u32 data_was_unread = 0; int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == DCCP_LISTEN) { dccp_set_state(sk, DCCP_CLOSED); /* Special case. 
*/ inet_csk_listen_stop(sk); goto adjudge_to_death; } sk_stop_timer(sk, &dp->dccps_xmit_timer); /* * We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the *reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { data_was_unread += skb->len; __kfree_skb(skb); } /* If socket has been already reset kill it. */ if (sk->sk_state == DCCP_CLOSED) goto adjudge_to_death; if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { /* * Normal connection termination. May need to wait if there are * still packets in the TX queue that are delayed by the CCID. */ dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } /* * Flush write queue. This may be necessary in several cases: * - we have been closed by the peer but still have application data; * - abortive termination (unread data or zero linger time), * - normal termination but queue could not be flushed within time limit */ __skb_queue_purge(&sk->sk_write_queue); sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); /* * It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); /* * Now socket is owned by kernel and we acquire BH lock * to finish close. No need to check for user refs. */ local_bh_disable(); bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? 
*/ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; if (sk->sk_state == DCCP_CLOSED) inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL_GPL(dccp_close); void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("called shutdown(%x)\n", how); } EXPORT_SYMBOL_GPL(dccp_shutdown); static inline int __init dccp_mib_init(void) { dccp_statistics = alloc_percpu(struct dccp_mib); if (!dccp_statistics) return -ENOMEM; return 0; } static inline void dccp_mib_exit(void) { free_percpu(dccp_statistics); } static int thash_entries; module_param(thash_entries, int, 0444); MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); #ifdef CONFIG_IP_DCCP_DEBUG bool dccp_debug; module_param(dccp_debug, bool, 0644); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) { unsigned long goal; unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; dccp_hashinfo.bind2_bucket_cachep = kmem_cache_create("dccp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind2_bucket_cachep) goto out_free_bind_bucket_cachep; /* * Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. 
*/ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (21 - PAGE_SHIFT); else goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) ; do { unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / sizeof(struct inet_ehash_bucket); while (hash_size & (hash_size - 1)) hash_size--; dccp_hashinfo.ehash_mask = hash_size - 1; dccp_hashinfo.ehash = (struct inet_ehash_bucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); } while (!dccp_hashinfo.ehash && --ehash_order > 0); if (!dccp_hashinfo.ehash) { DCCP_CRIT("Failed to allocate DCCP established hash table"); goto out_free_bind2_bucket_cachep; } for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&dccp_hashinfo)) goto out_free_dccp_ehash; bhash_order = ehash_order; do { dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / sizeof(struct inet_bind_hashbucket); if ((dccp_hashinfo.bhash_size > (64 * 1024)) && bhash_order > 0) continue; dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); } while (!dccp_hashinfo.bhash && --bhash_order >= 0); if (!dccp_hashinfo.bhash) { DCCP_CRIT("Failed to allocate DCCP bind hash table"); goto out_free_dccp_locks; } dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); if (!dccp_hashinfo.bhash2) { DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); goto out_free_dccp_bhash; } for (i = 0; i < dccp_hashinfo.bhash_size; i++) { spin_lock_init(&dccp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); spin_lock_init(&dccp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } dccp_hashinfo.pernet = false; rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash2; rc = dccp_ackvec_init(); if (rc) goto 
out_free_dccp_mib; rc = dccp_sysctl_init(); if (rc) goto out_ackvec_exit; rc = ccid_initialize_builtins(); if (rc) goto out_sysctl_exit; dccp_timestamping_init(); return 0; out_sysctl_exit: dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: dccp_mib_exit(); out_free_dccp_bhash2: free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); out_free_dccp_bhash: free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); out_free_dccp_locks: inet_ehash_locks_free(&dccp_hashinfo); out_free_dccp_ehash: free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind2_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.bhash2 = NULL; dccp_hashinfo.ehash = NULL; dccp_hashinfo.bind_bucket_cachep = NULL; dccp_hashinfo.bind2_bucket_cachep = NULL; return rc; } static void __exit dccp_fini(void) { int bhash_order = get_order(dccp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); ccid_cleanup_builtins(); dccp_mib_exit(); free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); free_pages((unsigned long)dccp_hashinfo.ehash, get_order((dccp_hashinfo.ehash_mask + 1) * sizeof(struct inet_ehash_bucket))); inet_ehash_locks_free(&dccp_hashinfo); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); } module_init(dccp_init); module_exit(dccp_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
350 6806 1876 396 3976 263 3 148 802 42 825 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_DCACHE_H #define __LINUX_DCACHE_H #include <linux/atomic.h> #include <linux/list.h> #include <linux/math.h> #include <linux/rculist.h> #include <linux/rculist_bl.h> #include <linux/spinlock.h> #include <linux/seqlock.h> #include <linux/cache.h> #include <linux/rcupdate.h> #include <linux/lockref.h> #include <linux/stringhash.h> #include <linux/wait.h> struct path; struct file; struct vfsmount; /* * linux/include/linux/dcache.h * * Dirent cache data structures * * (C) Copyright 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ #define IS_ROOT(x) ((x) == (x)->d_parent) /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* * "quick string" -- eases parameter passing, but more importantly * saves "metadata" about the string (ie length and the hash). * * hash comes first so it snuggles against d_parent in the * dentry. */ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). 
*/
/* Size of the inline name buffer (d_iname); chosen per configuration. */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_LEN 40 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_LEN 36 /* 128 bytes */
# else
#  define DNAME_INLINE_LEN 44 /* 128 bytes */
# endif
#endif

/* Shorthand: the per-dentry spinlock lives inside d_lockref. */
#define d_lock	d_lockref.lock

/*
 * A directory cache entry.  Member order is deliberate (see the cacheline
 * boundary markers below); do not reorder without re-checking the layout.
 */
struct dentry {
	/* RCU lookup touched fields */
	unsigned int d_flags;		/* protected by d_lock */
	seqcount_spinlock_t d_seq;	/* per dentry seqlock */
	struct hlist_bl_node d_hash;	/* lookup hash list */
	struct dentry *d_parent;	/* parent directory */
	struct qstr d_name;
	struct inode *d_inode;		/* Where the name belongs to - NULL is
					 * negative */
	unsigned char d_iname[DNAME_INLINE_LEN];	/* small names */
	/* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */

	/* Ref lookup also touches following */
	const struct dentry_operations *d_op;
	struct super_block *d_sb;	/* The root of the dentry tree */
	unsigned long d_time;		/* used by d_revalidate */
	void *d_fsdata;			/* fs-specific data */
	/* --- cacheline 2 boundary (128 bytes) --- */
	struct lockref d_lockref;	/* per-dentry lock and refcount
					 * keep separate from RCU lookup area if
					 * possible!
					 */

	union {
		struct list_head d_lru;		/* LRU list */
		wait_queue_head_t *d_wait;	/* in-lookup ones only */
	};
	struct hlist_node d_sib;	/* child of parent list */
	struct hlist_head d_children;	/* our children */
	/*
	 * d_alias and d_rcu can share memory
	 */
	union {
		struct hlist_node d_alias;	/* inode alias list */
		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
		struct rcu_head d_rcu;
	} d_u;
};

/*
 * dentry->d_lock spinlock nesting subclasses:
 *
 * 0: normal
 * 1: nested
 */
enum dentry_d_lock_class
{
	DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */
	DENTRY_D_LOCK_NESTED
};

/* Selector passed to ->d_real() / d_real(): which real dentry is wanted. */
enum d_real_type {
	D_REAL_DATA,
	D_REAL_METADATA,
};

/* Per-filesystem dentry callbacks; each DCACHE_OP_* flag below mirrors one. */
struct dentry_operations {
	int (*d_revalidate)(struct dentry *, unsigned int);
	int (*d_weak_revalidate)(struct dentry *, unsigned int);
	int (*d_hash)(const struct dentry *, struct qstr *);
	int (*d_compare)(const struct dentry *,
			unsigned int, const char *, const struct qstr *);
	int (*d_delete)(const struct dentry *);
	int (*d_init)(struct dentry *);
	void (*d_release)(struct dentry *);
	void (*d_prune)(struct dentry *);
	void (*d_iput)(struct dentry *, struct inode *);
	char *(*d_dname)(struct dentry *, char *, int);
	struct vfsmount *(*d_automount)(struct path *);
	int (*d_manage)(const struct path *, bool);
	struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/* d_flags entries */
#define DCACHE_OP_HASH			BIT(0)
#define DCACHE_OP_COMPARE		BIT(1)
#define DCACHE_OP_REVALIDATE		BIT(2)
#define DCACHE_OP_DELETE		BIT(3)
#define DCACHE_OP_PRUNE			BIT(4)

#define DCACHE_DISCONNECTED		BIT(5)
     /* This dentry is possibly not currently connected to the dcache tree, in
      * which case its parent will either be itself, or will have this flag as
      * well.  nfsd will not use a dentry with this bit set, but will first
      * endeavour to clear the bit either by discovering that it is connected,
      * or by performing lookup operations.   Any filesystem which supports
      * nfsd_operations MUST have a lookup function which, if it finds a
      * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that
      * dentry into place and return that dentry rather than the passed one,
      * typically using d_splice_alias. */

#define DCACHE_REFERENCED		BIT(6) /* Recently used, don't discard.
*/
#define DCACHE_DONTCACHE		BIT(7) /* Purge from memory on final dput() */

#define DCACHE_CANT_MOUNT		BIT(8)
#define DCACHE_GENOCIDE			BIT(9)
#define DCACHE_SHRINK_LIST		BIT(10)

#define DCACHE_OP_WEAK_REVALIDATE	BIT(11)

#define DCACHE_NFSFS_RENAMED		BIT(12)
     /* this dentry has been "silly renamed" and has to be deleted on the last
      * dput() */
/* NOTE(review): bit 13 is not assigned anywhere in this list. */
#define DCACHE_FSNOTIFY_PARENT_WATCHED	BIT(14)
     /* Parent inode is watched by some fsnotify listener */

#define DCACHE_DENTRY_KILLED		BIT(15)

#define DCACHE_MOUNTED			BIT(16) /* is a mountpoint */
#define DCACHE_NEED_AUTOMOUNT		BIT(17) /* handle automount on this dir */
#define DCACHE_MANAGE_TRANSIT		BIT(18) /* manage transit from this dirent */
/* Union of the three "managed" bits above; tested by d_managed(). */
#define DCACHE_MANAGED_DENTRY \
	(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

#define DCACHE_LRU_LIST			BIT(19)

/*
 * The entry type is a 3-bit enumerated field, not a set of independent
 * flags: mask with DCACHE_ENTRY_TYPE and compare for equality.
 */
#define DCACHE_ENTRY_TYPE		(7 << 20) /* bits 20..22 are for storing type: */
#define DCACHE_MISS_TYPE		(0 << 20) /* Negative dentry */
#define DCACHE_WHITEOUT_TYPE		(1 << 20) /* Whiteout dentry (stop pathwalk) */
#define DCACHE_DIRECTORY_TYPE		(2 << 20) /* Normal directory */
#define DCACHE_AUTODIR_TYPE		(3 << 20) /* Lookupless directory (presumed automount) */
#define DCACHE_REGULAR_TYPE		(4 << 20) /* Regular file type */
#define DCACHE_SPECIAL_TYPE		(5 << 20) /* Other file type */
#define DCACHE_SYMLINK_TYPE		(6 << 20) /* Symlink */

#define DCACHE_NOKEY_NAME		BIT(25) /* Encrypted name encoded without key */
#define DCACHE_OP_REAL			BIT(26)

#define DCACHE_PAR_LOOKUP		BIT(28) /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR		BIT(29)
#define DCACHE_NORCU			BIT(30) /* No RCU delay for freeing */

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
*/
/*
 * Prototypes only: the implementations are not part of this header, so the
 * grouping comments below are the only documentation available here.
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
					wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
			const struct qstr *name);
extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

/* tmpfile helpers: both take the open file and the inode backing it */
extern void d_mark_tmpfile(struct file *, struct inode *);
extern void d_tmpfile(struct file *, struct inode *);

/* inode alias lookup / pruning */
extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

extern struct dentry *d_find_alias_rcu(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
*/ extern void d_rehash(struct dentry *); extern void d_add(struct dentry *, struct inode *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; } ino_t d_parent_ino(struct dentry *dentry); /* * helper function for dentry_operations.d_dname() members */ extern __printf(3, 4) char *dynamic_dname(char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path_raw(const struct dentry *, char *, int); extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** * dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a live dentry, increment the reference count and return the dentry. * Caller must hold @dentry->d_lock. Making sure that dentry is alive is * caller's resonsibility. There are many conditions sufficient to guarantee * that; e.g. anything with non-negative refcount is alive, so's anything * hashed, anything positive, anyone's parent, etc. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { dentry->d_lockref.count++; return dentry; } /** * dget - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be * destroyed when it has references. Conversely, a dentry with * no references can disappear for any number of reasons, starting * with memory pressure. 
In other words, that primitive is * used to clone an existing reference; using it on something with * zero refcount is a bug. * * NOTE: it will spin if @dentry->d_lock is held. From the deadlock * avoidance point of view it is equivalent to spin_lock()/increment * refcount/spin_unlock(), so calling it under @dentry->d_lock is * always a bug; so's calling it under ->d_lock on any of its descendents. * */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) lockref_get(&dentry->d_lockref); return dentry; } extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed * @dentry: entry to check * * Returns true if the dentry passed is not currently hashed. */ static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); } static inline int d_unlinked(const struct dentry *dentry) { return d_unhashed(dentry) && !IS_ROOT(dentry); } static inline int cant_mount(const struct dentry *dentry) { return (dentry->d_flags & DCACHE_CANT_MOUNT); } static inline void dont_mount(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_CANT_MOUNT; spin_unlock(&dentry->d_lock); } extern void __d_lookup_unhash_wake(struct dentry *dentry); static inline int d_in_lookup(const struct dentry *dentry) { return dentry->d_flags & DCACHE_PAR_LOOKUP; } static inline void d_lookup_done(struct dentry *dentry) { if (unlikely(d_in_lookup(dentry))) __d_lookup_unhash_wake(dentry); } extern void dput(struct dentry *); static inline bool d_managed(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MANAGED_DENTRY; } static inline bool d_mountpoint(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } /* * Directory cache entry type accessor functions. 
*/ static inline unsigned __d_entry_type(const struct dentry *dentry) { return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_MISS_TYPE; } static inline bool d_is_whiteout(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; } static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; } static inline bool d_is_autodir(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; } static inline bool d_is_dir(const struct dentry *dentry) { return d_can_lookup(dentry) || d_is_autodir(dentry); } static inline bool d_is_symlink(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } static inline bool d_is_reg(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; } static inline bool d_is_special(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; } static inline bool d_is_file(const struct dentry *dentry) { return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) { // TODO: check d_is_whiteout(dentry) also. return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! 
(1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, 100); } /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. 
*/ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) return dentry->d_op->d_real(dentry, type); else return dentry; } /** * d_real_inode - Return the real inode hosting the data * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. * Otherwise return d_inode(). 
*/ static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA)); } struct name_snapshot { struct qstr name; unsigned char inline_name[DNAME_INLINE_LEN]; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); static inline struct dentry *d_first_child(const struct dentry *dentry) { return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib); } static inline struct dentry *d_next_sibling(const struct dentry *dentry) { return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); } #endif /* __LINUX_DCACHE_H */
46 2119 1120 4898 5016 5024 9 93 7285 7286 7285 7150 7150 7144 121 2045 46 45 4479 2152 2345 3120 1372 3410 1089 3859 859 3334 3328 3334 1929 1929 1928 52 52 3777 3788 3772 3690 85 46 1977 2876 3480 485 3783 3777 3770 11 3791 2716 53615 54186 2121 2723 2048 2055 2038 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 
461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/file_table.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/task_work.h> #include <linux/swap.h> #include <linux/kmemleak.h> #include <linux/atomic.h> #include "internal.h" /* sysctl tunables... 
*/
static struct files_stat_struct files_stat = {
	.max_files = NR_FILE
};

/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
static struct kmem_cache *bfilp_cachep __ro_after_init;

/* Count of accounted files; files with FMODE_NOACCOUNT are not included. */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/* Container for backing file with optional user path */
struct backing_file {
	struct file file;
	union {
		struct path user_path;
		freeptr_t bf_freeptr;	/* slab free pointer, see files_init() */
	};
};

/* Map an embedded struct file back to its backing_file container. */
static inline struct backing_file *backing_file(struct file *f)
{
	return container_of(f, struct backing_file, file);
}

/*
 * Return the user_path of a backing file.  @f must be embedded in a
 * struct backing_file, since the lookup is a plain container_of().
 */
struct path *backing_file_user_path(struct file *f)
{
	return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/*
 * Undo init_file() and return the object to the slab cache it came from:
 * release the security blob, drop the nr_files accounting unless the file
 * is FMODE_NOACCOUNT, put the credentials, and for backing files also put
 * the user path before freeing the container.
 */
static inline void file_free(struct file *f)
{
	security_file_free(f);
	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
		percpu_counter_dec(&nr_files);
	put_cred(f->f_cred);
	if (unlikely(f->f_mode & FMODE_BACKING)) {
		path_put(backing_file_user_path(f));
		kmem_cache_free(bfilp_cachep, backing_file(f));
	} else {
		kmem_cache_free(filp_cachep, f);
	}
}

/*
 * Return the total number of open files in the system
 */
static long get_nr_files(void)
{
	return percpu_counter_read_positive(&nr_files);
}

/*
 * Return the maximum number of open files in the system
 */
unsigned long get_max_files(void)
{
	return files_stat.max_files;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * Handle nr_files sysctl
 *
 * Refreshes files_stat.nr_files from the per-cpu counter before handing the
 * request to the generic unsigned-long vector handler.
 */
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
			 size_t *lenp, loff_t *ppos)
{
	files_stat.nr_files = get_nr_files();
	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/* Entries registered under "fs" by init_fs_stat_sysctls(). */
static struct ctl_table fs_stat_sysctls[] = {
	{
		.procname	= "file-nr",
		.data		= &files_stat,
		.maxlen		= sizeof(files_stat),
		.mode		= 0444,
		.proc_handler	= proc_nr_files,
	},
	{
		.procname	= "file-max",
		.data		= &files_stat.max_files,
		.maxlen		= sizeof(files_stat.max_files),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= SYSCTL_LONG_ZERO,
		.extra2		= SYSCTL_LONG_MAX,
	},
	{
		.procname	= "nr_open",
		.data		= &sysctl_nr_open,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= &sysctl_nr_open_min,
		.extra2		= &sysctl_nr_open_max,
	},
};

static int __init init_fs_stat_sysctls(void)
{
	register_sysctl_init("fs", fs_stat_sysctls);
	if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
		struct ctl_table_header *hdr;

		hdr = register_sysctl_mount_point("fs/binfmt_misc");
		/* hdr is intentionally never unregistered; tell kmemleak so. */
		kmemleak_not_leak(hdr);
	}
	return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialize a freshly allocated struct file.
 *
 * Takes a reference on @cred and allocates the security blob first; if the
 * latter fails the cred reference is dropped and the error is returned with
 * nothing else touched.  On success returns 0 with the reference count set
 * to 1.  The caller remains responsible for freeing @f on failure.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
	int error;

	f->f_cred = get_cred(cred);
	error = security_file_alloc(f);
	if (unlikely(error)) {
		put_cred(f->f_cred);
		return error;
	}

	spin_lock_init(&f->f_lock);
	/*
	 * Note that f_pos_lock is only used for files raising
	 * FMODE_ATOMIC_POS and directories. Other files such as pipes
	 * don't need it and since f_pos_lock is in a union may reuse
	 * the space for other purposes. They are expected to initialize
	 * the respective member when opening the file.
	 */
	mutex_init(&f->f_pos_lock);
	memset(&f->f_path, 0, sizeof(f->f_path));
	memset(&f->f_ra, 0, sizeof(f->f_ra));

	f->f_flags	= flags;
	f->f_mode	= OPEN_FMODE(flags);

	f->f_op		= NULL;
	f->f_mapping	= NULL;
	f->private_data = NULL;
	f->f_inode	= NULL;
	f->f_owner	= NULL;
#ifdef CONFIG_EPOLL
	f->f_ep		= NULL;
#endif

	f->f_iocb_flags = 0;
	f->f_pos	= 0;
	f->f_wb_err	= 0;
	f->f_sb_err	= 0;

	/*
	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
	 * fget-rcu pattern users need to be able to handle spurious
	 * refcount bumps we should reinitialize the reused file first.
	 */
	file_ref_init(&f->f_ref, 1);
	return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happened e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.
You are responsible for * getting write access to any mount that you might assign * to this filp, if it is opened for write. If this is not * done, you will imbalance int the mount's writer count * and a warning at __fput() time. */ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; int error; /* * Privileged users can go above max_files */ if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. */ if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) goto over; } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } percpu_counter_inc(&nr_files); return f; over: /* Ran out of filps - report that */ if (get_nr_files() > old_max) { pr_info("VFS: file-max limit %lu reached\n", get_max_files()); old_max = get_nr_files(); } return ERR_PTR(-ENFILE); } /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { struct file *f; int error; f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (unlikely(!f)) return ERR_PTR(-ENOMEM); error = init_file(f, flags, cred); if (unlikely(error)) { kmem_cache_free(filp_cachep, f); return ERR_PTR(error); } f->f_mode |= FMODE_NOACCOUNT; return f; } /* * Variant of alloc_empty_file() that allocates a backing_file container * and doesn't check and modify nr_files. * * This is only for kernel internal use, and the allocate file must not be * installed into file tables or such. 
*/ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) { struct backing_file *ff; int error; ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { kmem_cache_free(bfilp_cachep, ff); return ERR_PTR(error); } ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; return &ff->file; } /** * file_init_path - initialize a 'struct file' based on path * * @file: the file to set up * @path: the (dentry, vfsmount) pair for the new file * @fop: the 'struct file_operations' for the new file */ static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); file->f_sb_err = file_sample_sb_err(file); if (fop->llseek) file->f_mode |= FMODE_LSEEK; if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; if ((file->f_mode & FMODE_WRITE) && likely(fop->write || fop->write_iter)) file->f_mode |= FMODE_CAN_WRITE; file->f_iocb_flags = iocb_flags(file); file->f_mode |= FMODE_OPENED; file->f_op = fop; if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); } /** * alloc_file - allocate and initialize a 'struct file' * * @path: the (dentry, vfsmount) pair for the new file * @flags: O_... 
flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Returns the new file, or the error pointer from alloc_empty_file().
 * No reference is taken on @path here; the path is copied as-is.
 */
static struct file *alloc_file(const struct path *path, int flags,
		const struct file_operations *fop)
{
	struct file *file;

	file = alloc_empty_file(flags, current_cred());
	if (!IS_ERR(file))
		file_init_path(file, path, fop);
	return file;
}

/*
 * Build a pseudo path for @inode on @mnt: allocate a pseudo dentry named
 * @name, take a mount reference, and attach @inode to the dentry.
 * Returns 0 on success or -ENOMEM if the dentry cannot be allocated.
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
				    struct vfsmount *mnt, struct path *path)
{
	struct qstr this = QSTR_INIT(name, strlen(name));

	path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
	if (!path->dentry)
		return -ENOMEM;
	path->mnt = mntget(mnt);
	d_instantiate(path->dentry, inode);
	return 0;
}

/*
 * Allocate an accounted file for @inode behind a pseudo dentry on @mnt.
 * Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
			       const char *name, int flags,
			       const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_file(&path, flags, fops);
	if (IS_ERR(file)) {
		/*
		 * NOTE(review): the extra inode reference is taken before
		 * path_put(), presumably so that the caller's reference
		 * survives dropping the dentry - confirm.
		 */
		ihold(inode);
		path_put(&path);
	}
	return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Same as alloc_file_pseudo(), but the file comes from
 * alloc_empty_file_noaccount() and so is not counted in nr_files.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
					 struct vfsmount *mnt, const char *name,
					 int flags,
					 const struct file_operations *fops)
{
	int ret;
	struct path path;
	struct file *file;

	ret = alloc_path_pseudo(name, inode, mnt, &path);
	if (ret)
		return ERR_PTR(ret);

	file = alloc_empty_file_noaccount(flags, current_cred());
	if (IS_ERR(file)) {
		/* same unwind as in alloc_file_pseudo() */
		ihold(inode);
		path_put(&path);
		return file;
	}
	file_init_path(file, &path, fops);
	return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file on the same path as @base.  On success a path
 * reference is taken for the new file and f_mapping is copied from @base
 * (overriding the inode's mapping set by file_init_path()).
 */
struct file *alloc_file_clone(struct file *base, int flags,
				const struct file_operations *fops)
{
	struct file *f;

	f = alloc_file(&base->f_path, flags, fops);
	if (!IS_ERR(f)) {
		path_get(&f->f_path);
		f->f_mapping = base->f_mapping;
	}
	return f;
}

/* the real guts of fput() - releasing the last reference to file */
static void __fput(struct file *file)
{
	/* Snapshot everything needed after the file's own state is torn down. */
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	/* Never fully opened: nothing to release except the file itself. */
	if (unlikely(!(file->f_mode & FMODE_OPENED)))
		goto out;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_file(file);

	security_file_release(file);
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	if (file->f_op->release)
		file->f_op->release(inode, file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	file_f_owner_release(file);
	put_file_access(file);
	/* Drop the path last, using the snapshots taken on entry. */
	dput(dentry);
	if (unlikely(mode & FMODE_NEED_UNMOUNT))
		dissolve_on_fput(mnt);
	mntput(mnt);
out:
	file_free(file);
}

/* Files whose final __fput() was deferred to the delayed work item. */
static LLIST_HEAD(delayed_fput_list);

/* Work handler: detach the whole deferred list and __fput() each entry. */
static void delayed_fput(struct work_struct *unused)
{
	struct llist_node *node = llist_del_all(&delayed_fput_list);
	struct file *f, *t;

	llist_for_each_entry_safe(f, t, node, f_llist)
		__fput(f);
}

/* task_work callback: recover the file from its embedded callback head. */
static void ____fput(struct callback_head *work)
{
	__fput(container_of(work, struct file, f_task_work));
}

static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs has
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
*/ void flush_delayed_fput(void) { delayed_fput(NULL); flush_delayed_work(&delayed_fput_work); } EXPORT_SYMBOL_GPL(flush_delayed_fput); void fput(struct file *file) { if (file_ref_put(&file->f_ref)) { struct task_struct *task = current; if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { file_free(file); return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_task_work, ____fput); if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. */ } if (llist_add(&file->f_llist, &delayed_fput_list)) schedule_delayed_work(&delayed_fput_work, 1); } } /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without * risking deadlocks), need to wait for completion of __fput() and know * for this specific struct file it won't involve anything that would * need them. Use only if you really need it - at the very least, * don't blindly convert fput() by kernel thread to that. */ void __fput_sync(struct file *file) { if (file_ref_put(&file->f_ref)) __fput(file); } EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct file, f_freeptr), }; filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } /* * One file with associated inode and dcache is very roughly 1K. Per default * do not use more than 10% of our memory for files. 
*/
void __init files_maxfiles_init(void)
{
	unsigned long total = totalram_pages();
	/* 1.5x the pages currently in use, capped just below the total */
	unsigned long reserved = (total - nr_free_pages()) * 3/2;
	unsigned long limit;

	reserved = min(reserved, total - 1);

	/* a tenth of what is left, at roughly 1K per file */
	limit = ((total - reserved) * (PAGE_SIZE / 1024)) / 10;

	/* never go below the NR_FILE floor */
	files_stat.max_files = max_t(unsigned long, limit, NR_FILE);
}
3 3 2 1 2 2 2 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct rss_req_info { struct ethnl_req_info base; u32 rss_context; }; struct rss_reply_data { struct ethnl_reply_data base; bool no_key_fields; u32 indir_size; u32 hkey_size; u32 hfunc; u32 input_xfrm; u32 *indir_table; u8 *hkey; }; #define RSS_REQINFO(__req_base) \ container_of(__req_base, struct rss_req_info, base) #define RSS_REPDATA(__reply_base) \ container_of(__reply_base, struct rss_reply_data, base) const struct nla_policy ethnl_rss_get_policy[] = { [ETHTOOL_A_RSS_HEADER] = 
NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 }, [ETHTOOL_A_RSS_START_CONTEXT] = { .type = NLA_U32 }, }; static int rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rss_req_info *request = RSS_REQINFO(req_info); if (tb[ETHTOOL_A_RSS_CONTEXT]) request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); if (tb[ETHTOOL_A_RSS_START_CONTEXT]) { NL_SET_BAD_ATTR(extack, tb[ETHTOOL_A_RSS_START_CONTEXT]); return -EINVAL; } return 0; } static int rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, struct rss_reply_data *data, const struct genl_info *info) { struct ethtool_rxfh_param rxfh = {}; const struct ethtool_ops *ops; u32 total_size, indir_bytes; u8 *rss_config; int ret; ops = dev->ethtool_ops; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->indir_size = 0; data->hkey_size = 0; if (ops->get_rxfh_indir_size) data->indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) data->hkey_size = ops->get_rxfh_key_size(dev); indir_bytes = data->indir_size * sizeof(u32); total_size = indir_bytes + data->hkey_size; rss_config = kzalloc(total_size, GFP_KERNEL); if (!rss_config) { ret = -ENOMEM; goto out_ops; } if (data->indir_size) data->indir_table = (u32 *)rss_config; if (data->hkey_size) data->hkey = rss_config + indir_bytes; rxfh.indir_size = data->indir_size; rxfh.indir = data->indir_table; rxfh.key_size = data->hkey_size; rxfh.key = data->hkey; ret = ops->get_rxfh(dev, &rxfh); if (ret) goto out_ops; data->hfunc = rxfh.hfunc; data->input_xfrm = rxfh.input_xfrm; out_ops: ethnl_ops_complete(dev); return ret; } static int rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, struct rss_reply_data *data, const struct genl_info *info) { struct ethtool_rxfh_context *ctx; u32 total_size, indir_bytes; u8 *rss_config; ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); if (!ctx) return -ENOENT; 
data->indir_size = ctx->indir_size; data->hkey_size = ctx->key_size; data->hfunc = ctx->hfunc; data->input_xfrm = ctx->input_xfrm; indir_bytes = data->indir_size * sizeof(u32); total_size = indir_bytes + data->hkey_size; rss_config = kzalloc(total_size, GFP_KERNEL); if (!rss_config) return -ENOMEM; data->indir_table = (u32 *)rss_config; memcpy(data->indir_table, ethtool_rxfh_context_indir(ctx), indir_bytes); if (data->hkey_size) { data->hkey = rss_config + indir_bytes; memcpy(data->hkey, ethtool_rxfh_context_key(ctx), data->hkey_size); } return 0; } static int rss_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct rss_reply_data *data = RSS_REPDATA(reply_base); struct rss_req_info *request = RSS_REQINFO(req_base); struct net_device *dev = reply_base->dev; const struct ethtool_ops *ops; ops = dev->ethtool_ops; if (!ops->get_rxfh) return -EOPNOTSUPP; /* Some drivers don't handle rss_context */ if (request->rss_context) { if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) return -EOPNOTSUPP; data->no_key_fields = !ops->rxfh_per_ctx_key; return rss_prepare_ctx(request, dev, data, info); } return rss_prepare_get(request, dev, data, info); } static int rss_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); int len; len = nla_total_size(sizeof(u32)) + /* _RSS_CONTEXT */ nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ nla_total_size(data->hkey_size); /* _RSS_HKEY */ return len; } static int rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); struct rss_req_info *request = RSS_REQINFO(req_base); if (request->rss_context && nla_put_u32(skb, 
ETHTOOL_A_RSS_CONTEXT, request->rss_context))
		return -EMSGSIZE;

	/* The indirection table is emitted only when it is non-empty. */
	if ((data->indir_size &&
	     nla_put(skb, ETHTOOL_A_RSS_INDIR,
		     sizeof(u32) * data->indir_size, data->indir_table)))
		return -EMSGSIZE;

	/* With no_key_fields set, hfunc, input_xfrm and key are all omitted. */
	if (data->no_key_fields)
		return 0;

	if ((data->hfunc &&
	     nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) ||
	    (data->input_xfrm &&
	     nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) ||
	    (data->hkey_size &&
	     nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)))
		return -EMSGSIZE;

	return 0;
}

/*
 * cleanup_data callback: free the buffer referenced by indir_table.
 * The prepare helpers point hkey into that same allocation, so it is
 * not freed separately.
 */
static void rss_cleanup_data(struct ethnl_reply_data *reply_base)
{
	const struct rss_reply_data *data = RSS_REPDATA(reply_base);

	kfree(data->indir_table);
}

/* Dump progress, stored in the netlink callback's ctx area. */
struct rss_nl_dump_ctx {
	/* device iteration cursor */
	unsigned long		ifindex;
	/* next RSS context to dump on the current device */
	unsigned long		ctx_idx;

	/* User wants to only dump contexts from given ifindex */
	unsigned int		match_ifindex;
	/* value ctx_idx is reset to when moving to the next device */
	unsigned int		start_ctx;
};

/* View cb->ctx as our dump context; the size is checked at build time. */
static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb)
{
	NL_ASSERT_CTX_FITS(struct rss_nl_dump_ctx);

	return (struct rss_nl_dump_ctx *)cb->ctx;
}

/*
 * Dump "start" handler: validate the request and seed the dump context.
 *
 * ETHTOOL_A_RSS_CONTEXT is refused with -EINVAL.  An optional start context
 * initialises both start_ctx and ctx_idx.  If the request header named a
 * device, its ifindex becomes both the filter (match_ifindex) and the
 * iteration cursor; the device reference is dropped again before returning.
 *
 * Returns the result of ethnl_parse_header_dev_get().
 */
int ethnl_rss_dump_start(struct netlink_callback *cb)
{
	const struct genl_info *info = genl_info_dump(cb);
	struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb);
	struct ethnl_req_info req_info = {};
	struct nlattr **tb = info->attrs;
	int ret;

	/* Filtering by context not supported */
	if (tb[ETHTOOL_A_RSS_CONTEXT]) {
		NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]);
		return -EINVAL;
	}
	if (tb[ETHTOOL_A_RSS_START_CONTEXT]) {
		ctx->start_ctx = nla_get_u32(tb[ETHTOOL_A_RSS_START_CONTEXT]);
		ctx->ctx_idx = ctx->start_ctx;
	}

	ret = ethnl_parse_header_dev_get(&req_info,
					 tb[ETHTOOL_A_RSS_HEADER],
					 sock_net(cb->skb->sk), cb->extack,
					 false);
	if (req_info.dev) {
		ctx->match_ifindex = req_info.dev->ifindex;
		ctx->ifindex = ctx->match_ifindex;
		ethnl_parse_header_dev_put(&req_info);
		req_info.dev = NULL;
	}

	return ret;
}

static int
rss_dump_one_ctx(struct sk_buff *skb, struct netlink_callback *cb,
		 struct net_device *dev, u32 rss_context)
{
	const struct genl_info *info = genl_info_dump(cb);
	struct
rss_reply_data data = {}; struct rss_req_info req = {}; void *ehdr; int ret; req.rss_context = rss_context; ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_RSS_GET_REPLY); if (!ehdr) return -EMSGSIZE; ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_RSS_HEADER); if (ret < 0) goto err_cancel; /* Context 0 is not currently storred or cached in the XArray */ if (!rss_context) ret = rss_prepare_get(&req, dev, &data, info); else ret = rss_prepare_ctx(&req, dev, &data, info); if (ret) goto err_cancel; ret = rss_fill_reply(skb, &req.base, &data.base); if (ret) goto err_cleanup; genlmsg_end(skb, ehdr); rss_cleanup_data(&data.base); return 0; err_cleanup: rss_cleanup_data(&data.base); err_cancel: genlmsg_cancel(skb, ehdr); return ret; } static int rss_dump_one_dev(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb); int ret; if (!dev->ethtool_ops->get_rxfh) return 0; if (!ctx->ctx_idx) { ret = rss_dump_one_ctx(skb, cb, dev, 0); if (ret) return ret; ctx->ctx_idx++; } for (; xa_find(&dev->ethtool->rss_ctx, &ctx->ctx_idx, ULONG_MAX, XA_PRESENT); ctx->ctx_idx++) { ret = rss_dump_one_ctx(skb, cb, dev, ctx->ctx_idx); if (ret) return ret; } ctx->ctx_idx = ctx->start_ctx; return 0; } int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb); struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; rtnl_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { if (ctx->match_ifindex && ctx->match_ifindex != ctx->ifindex) break; ret = rss_dump_one_dev(skb, cb, dev); if (ret) break; } rtnl_unlock(); return ret; } const struct ethnl_request_ops ethnl_rss_request_ops = { .request_cmd = ETHTOOL_MSG_RSS_GET, .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, .hdr_attr = ETHTOOL_A_RSS_HEADER, .req_info_size = sizeof(struct rss_req_info), .reply_data_size = sizeof(struct rss_reply_data), .parse_request = rss_parse_request, .prepare_data = rss_prepare_data, 
.reply_size = rss_reply_size, .fill_reply = rss_fill_reply, .cleanup_data = rss_cleanup_data, };
7 2 2 23 3 3 11 15 4 2 16 16 16 5 16 7 3 7 6 5 16 8 9 4 7 7 7 7 4 6 7 7 7 7 6 1 1 12 6 1 1 2 1 9 1 3 12 12 12 12 10 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 // 
SPDX-License-Identifier: GPL-2.0-only

#include <linux/ethtool.h>
#include <linux/phy.h>
#include "netlink.h"
#include "common.h"

/* Description of one string set as used while building a reply. */
struct strset_info {
	/* set is obtained from a device rather than from a global table */
	bool per_dev;
	/* @strings was allocated for this reply and must be kfree()d */
	bool free_strings;
	/* number of strings in the set */
	unsigned int count;
	/* the strings, ETH_GSTRING_LEN bytes each */
	const char (*strings)[ETH_GSTRING_LEN];
};

/*
 * Initial contents of a reply's set array, indexed by ETH_SS_* id.
 * Global sets carry their table and count here; per-device sets only
 * carry the per_dev flag and are filled in by strset_prepare_set().
 */
static const struct strset_info info_template[] = {
	[ETH_SS_TEST] = {
		.per_dev	= true,
	},
	[ETH_SS_STATS] = {
		.per_dev	= true,
	},
	[ETH_SS_PRIV_FLAGS] = {
		.per_dev	= true,
	},
	[ETH_SS_FEATURES] = {
		.per_dev	= false,
		.count		= ARRAY_SIZE(netdev_features_strings),
		.strings	= netdev_features_strings,
	},
	[ETH_SS_RSS_HASH_FUNCS] = {
		.per_dev	= false,
		.count		= ARRAY_SIZE(rss_hash_func_strings),
		.strings	= rss_hash_func_strings,
	},
	[ETH_SS_TUNABLES] = {
		.per_dev	= false,
		.count		= ARRAY_SIZE(tunable_strings),
		.strings	= tunable_strings,
	},
	[ETH_SS_PHY_STATS] = {
		.per_dev	= true,
	},
	[ETH_SS_PHY_TUNABLES] = {
		.per_dev	= false,
		.count		= ARRAY_SIZE(phy_tunable_strings),
		.strings	= phy_tunable_strings,
	},
	[ETH_SS_LINK_MODES] = {
		.per_dev	= false,
		.count		= __ETHTOOL_LINK_MODE_MASK_NBITS,
		.strings	= link_mode_names,
	},
	[ETH_SS_MSG_CLASSES] = {
		.per_dev	= false,
		.count		= NETIF_MSG_CLASS_COUNT,
		.strings	= netif_msg_class_names,
	},
	[ETH_SS_WOL_MODES] = {
		.per_dev	= false,
		.count		= WOL_MODE_COUNT,
		.strings	= wol_mode_names,
	},
	[ETH_SS_SOF_TIMESTAMPING] = {
		.per_dev	= false,
		.count		= __SOF_TIMESTAMPING_CNT,
		.strings	= sof_timestamping_names,
	},
	[ETH_SS_TS_TX_TYPES] = {
		.per_dev	= false,
		.count		= __HWTSTAMP_TX_CNT,
		.strings	= ts_tx_type_names,
	},
	[ETH_SS_TS_RX_FILTERS] = {
		.per_dev	= false,
		.count		= __HWTSTAMP_FILTER_CNT,
		.strings	= ts_rx_filter_names,
	},
	[ETH_SS_UDP_TUNNEL_TYPES] = {
		.per_dev	= false,
		.count		= __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
		.strings	= udp_tunnel_type_names,
	},
	[ETH_SS_STATS_STD] = {
		.per_dev	= false,
		.count		= __ETHTOOL_STATS_CNT,
		.strings	= stats_std_names,
	},
	[ETH_SS_STATS_ETH_PHY] = {
		.per_dev	= false,
		.count		= __ETHTOOL_A_STATS_ETH_PHY_CNT,
		.strings	= stats_eth_phy_names,
	},
	[ETH_SS_STATS_ETH_MAC] = {
		.per_dev	= false,
		.count		= __ETHTOOL_A_STATS_ETH_MAC_CNT,
		.strings	= stats_eth_mac_names,
	},
	[ETH_SS_STATS_ETH_CTRL] = {
		.per_dev	= false,
		.count		= __ETHTOOL_A_STATS_ETH_CTRL_CNT,
		.strings	= stats_eth_ctrl_names,
	},
	[ETH_SS_STATS_RMON] = {
		.per_dev	= false,
		.count		= __ETHTOOL_A_STATS_RMON_CNT,
		.strings	= stats_rmon_names,
	},
	[ETH_SS_STATS_PHY] = {
		.per_dev	= false,
		.count		= __ETHTOOL_A_STATS_PHY_CNT,
		.strings	= stats_phy_names,
	},
};

/* Parsed STRSET_GET request. */
struct strset_req_info {
	struct ethnl_req_info		base;
	/* bitmask of requested ETH_SS_* ids; 0 when none were listed */
	u32				req_ids;
	/* ETHTOOL_A_STRSET_COUNTS_ONLY flag was present */
	bool				counts_only;
};

#define STRSET_REQINFO(__req_base) \
	container_of(__req_base, struct strset_req_info, base)

/* Reply state: a private copy of info_template, filled in per request. */
struct strset_reply_data {
	struct ethnl_reply_data		base;
	struct strset_info		sets[ETH_SS_COUNT];
};

#define STRSET_REPDATA(__reply_base) \
	container_of(__reply_base, struct strset_reply_data, base)

const struct nla_policy ethnl_strset_get_policy[] = {
	[ETHTOOL_A_STRSET_HEADER]	=
		NLA_POLICY_NESTED(ethnl_header_policy_phy),
	[ETHTOOL_A_STRSET_STRINGSETS]	= { .type = NLA_NESTED },
	[ETHTOOL_A_STRSET_COUNTS_ONLY]	= { .type = NLA_FLAG },
};

static const struct nla_policy get_stringset_policy[] = {
	[ETHTOOL_A_STRINGSET_ID]	= { .type = NLA_U32 },
};

/**
 * strset_include() - test if a string set should be included in reply
 * @info: parsed client request
 * @data: pointer to request data structure
 * @id:   id of string set to check (ETH_SS_* constants)
 */
static bool strset_include(const struct strset_req_info *info,
			   const struct strset_reply_data *data, u32 id)
{
	bool per_dev;

	/* every id must have its own bit in req_ids */
	BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids));

	/* an explicit list of ids overrides the defaults below */
	if (info->req_ids)
		return info->req_ids & (1U << id);
	per_dev = data->sets[id].per_dev;
	if (!per_dev && !data->sets[id].strings)
		return false;

	return data->base.dev ?
per_dev : !per_dev; } static int strset_get_id(const struct nlattr *nest, u32 *val, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(get_stringset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(get_stringset_policy) - 1, nest, get_stringset_policy, extack); if (ret < 0) return ret; if (NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_STRINGSET_ID)) return -EINVAL; *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]); return 0; } static const struct nla_policy strset_stringsets_policy[] = { [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED }, }; static int strset_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS]; struct nlattr *attr; int rem, ret; if (!nest) return 0; ret = nla_validate_nested(nest, ARRAY_SIZE(strset_stringsets_policy) - 1, strset_stringsets_policy, extack); if (ret < 0) return ret; req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY]; nla_for_each_nested(attr, nest, rem) { u32 id; if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET, "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n", nla_type(attr))) return -EINVAL; ret = strset_get_id(attr, &id, extack); if (ret < 0) return ret; if (id >= ETH_SS_COUNT) { NL_SET_ERR_MSG_ATTR(extack, attr, "unknown string set id"); return -EOPNOTSUPP; } req_info->req_ids |= (1U << id); } return 0; } static void strset_cleanup_data(struct ethnl_reply_data *reply_base) { struct strset_reply_data *data = STRSET_REPDATA(reply_base); unsigned int i; for (i = 0; i < ETH_SS_COUNT; i++) if (data->sets[i].free_strings) { kfree(data->sets[i].strings); data->sets[i].strings = NULL; data->sets[i].free_strings = false; } } static int strset_prepare_set(struct strset_info *info, struct net_device *dev, struct phy_device *phydev, unsigned int id, bool counts_only) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; 
const struct ethtool_ops *ops = dev->ethtool_ops;
	void *strings;
	int count, ret;

	/*
	 * PHY statistics come from the PHY ops when the MAC driver has no
	 * get_ethtool_phy_stats() of its own; everything else is asked of
	 * the netdev's ethtool ops.
	 */
	if (id == ETH_SS_PHY_STATS && phydev &&
	    !ops->get_ethtool_phy_stats && phy_ops &&
	    phy_ops->get_sset_count)
		ret = phy_ops->get_sset_count(phydev);
	else if (ops->get_sset_count && ops->get_strings)
		ret = ops->get_sset_count(dev, id);
	else
		ret = -EOPNOTSUPP;
	/* errors and empty sets are both reported as "no strings" */
	if (ret <= 0) {
		info->count = 0;
		return 0;
	}

	count = ret;
	if (!counts_only) {
		strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL);
		if (!strings)
			return -ENOMEM;
		/*
		 * NOTE(review): the count above may have come from phy_ops
		 * without ops->get_strings having been checked; if
		 * phy_ops->get_strings were NULL in that case the else
		 * branch would call a possibly-NULL ops->get_strings.
		 * Presumably PHY ops always provide both - confirm.
		 */
		if (id == ETH_SS_PHY_STATS && phydev &&
		    !ops->get_ethtool_phy_stats && phy_ops &&
		    phy_ops->get_strings)
			phy_ops->get_strings(phydev, strings);
		else
			ops->get_strings(dev, id, strings);
		info->strings = strings;
		/* ownership passes to the reply; see strset_cleanup_data() */
		info->free_strings = true;
	}
	info->count = count;

	return 0;
}

/*
 * prepare_data callback: start from info_template and, when a device is
 * given, query it for each included per-device set.
 *
 * Without a device only global sets can be served, so an explicit request
 * for a per-device set fails with -EINVAL.  On failure after some sets
 * were fetched, whatever was allocated is released again.
 */
static int strset_prepare_data(const struct ethnl_req_info *req_base,
			       struct ethnl_reply_data *reply_base,
			       const struct genl_info *info)
{
	const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
	struct strset_reply_data *data = STRSET_REPDATA(reply_base);
	struct net_device *dev = reply_base->dev;
	struct nlattr **tb = info->attrs;
	struct phy_device *phydev;
	unsigned int i;
	int ret;

	BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT);
	memcpy(&data->sets, &info_template, sizeof(data->sets));

	if (!dev) {
		for (i = 0; i < ETH_SS_COUNT; i++) {
			if ((req_info->req_ids & (1U << i)) &&
			    data->sets[i].per_dev) {
				GENL_SET_ERR_MSG(info,
						 "requested per device strings without dev");
				return -EINVAL;
			}
		}
		return 0;
	}

	phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_HEADER_FLAGS],
				      info->extack);

	/* phydev can be NULL, check for errors only */
	if (IS_ERR(phydev))
		return PTR_ERR(phydev);

	ret = ethnl_ops_begin(dev);
	if (ret < 0)
		goto err_strset;
	for (i = 0; i < ETH_SS_COUNT; i++) {
		/* global sets are already complete in the template copy */
		if (!strset_include(req_info, data, i) ||
		    !data->sets[i].per_dev)
			continue;

		ret = strset_prepare_set(&data->sets[i], dev, phydev, i,
					 req_info->counts_only);
		if (ret < 0)
			goto err_ops;
	}
	ethnl_ops_complete(dev);

	return 0;
err_ops:
ethnl_ops_complete(dev); err_strset: strset_cleanup_data(reply_base); return ret; } /* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */ static int strset_set_size(const struct strset_info *info, bool counts_only) { unsigned int len = 0; unsigned int i; if (info->count == 0) return 0; if (counts_only) return nla_total_size(2 * nla_total_size(sizeof(u32))); for (i = 0; i < info->count; i++) { const char *str = info->strings[i]; /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */ len += nla_total_size(nla_total_size(sizeof(u32)) + ethnl_strz_size(str)); } /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */ len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len); return nla_total_size(len); } static int strset_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); const struct strset_reply_data *data = STRSET_REPDATA(reply_base); unsigned int i; int len = 0; int ret; len += nla_total_size(0); /* ETHTOOL_A_STRSET_STRINGSETS */ for (i = 0; i < ETH_SS_COUNT; i++) { const struct strset_info *set_info = &data->sets[i]; if (!strset_include(req_info, data, i)) continue; ret = strset_set_size(set_info, req_info->counts_only); if (ret < 0) return ret; len += ret; } return len; } /* fill one string into reply */ static int strset_fill_string(struct sk_buff *skb, const struct strset_info *set_info, u32 idx) { struct nlattr *string_attr; const char *value; value = set_info->strings[idx]; string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING); if (!string_attr) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) || ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value)) goto nla_put_failure; nla_nest_end(skb, string_attr); return 0; nla_put_failure: nla_nest_cancel(skb, string_attr); return -EMSGSIZE; } /* fill one string set into reply */ static int strset_fill_set(struct sk_buff *skb, const struct strset_info *set_info, 
u32 id, bool counts_only) { struct nlattr *stringset_attr; struct nlattr *strings_attr; unsigned int i; if (!set_info->per_dev && !set_info->strings) return -EOPNOTSUPP; if (set_info->count == 0) return 0; stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET); if (!stringset_attr) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) || nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count)) goto nla_put_failure; if (!counts_only) { strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS); if (!strings_attr) goto nla_put_failure; for (i = 0; i < set_info->count; i++) { if (strset_fill_string(skb, set_info, i) < 0) goto nla_put_failure; } nla_nest_end(skb, strings_attr); } nla_nest_end(skb, stringset_attr); return 0; nla_put_failure: nla_nest_cancel(skb, stringset_attr); return -EMSGSIZE; } static int strset_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); const struct strset_reply_data *data = STRSET_REPDATA(reply_base); struct nlattr *nest; unsigned int i; int ret; nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS); if (!nest) return -EMSGSIZE; for (i = 0; i < ETH_SS_COUNT; i++) { if (strset_include(req_info, data, i)) { ret = strset_fill_set(skb, &data->sets[i], i, req_info->counts_only); if (ret < 0) goto nla_put_failure; } } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } const struct ethnl_request_ops ethnl_strset_request_ops = { .request_cmd = ETHTOOL_MSG_STRSET_GET, .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY, .hdr_attr = ETHTOOL_A_STRSET_HEADER, .req_info_size = sizeof(struct strset_req_info), .reply_data_size = sizeof(struct strset_reply_data), .allow_nodev_do = true, .parse_request = strset_parse_request, .prepare_data = strset_prepare_data, .reply_size = strset_reply_size, .fill_reply = strset_fill_reply, .cleanup_data = 
strset_cleanup_data, };
72 5 18 56 7 49 37 15 19 19 18 38 17 19 25 3 11 16 12 13 12 15 12 39 47 2 46 15 19 42 43 2 4 24 11 7 7 6 10 1 8 13 13 3 21 20 2 18 30 2 24 14 24 24 2 20 3 23 2 18 3 5 7 1 3 1 2 16 57 163 36 40 61 200 29 5 18 76 79 142 145 146 142 142 20 3 17 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/time.h> #include <linux/writeback.h> #include <linux/uio.h> #include <linux/random.h> #include <linux/iversion.h> #include "exfat_raw.h" #include "exfat_fs.h" int __exfat_write_inode(struct inode *inode, int sync) { unsigned long long on_disk_size; struct exfat_dentry *ep, *ep2; struct exfat_entry_set_cache es; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); bool is_dir = (ei->type == TYPE_DIR) ? true : false; struct timespec64 ts; if (inode->i_ino == EXFAT_ROOT_INO) return 0; /* * If the inode is already unlinked, there is no need for updating it. 
*/ if (ei->dir.dir == DIR_DELETED) return 0; if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1) return 0; exfat_set_volume_dirty(sb); /* get the directory entry of given file or directory */ if (exfat_get_dentry_set_by_ei(&es, sb, ei)) return -EIO; ep = exfat_get_dentry_cached(&es, ES_IDX_FILE); ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM); ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); /* set FILE_INFO structure using the acquired struct exfat_dentry */ exfat_set_entry_time(sbi, &ei->i_crtime, &ep->dentry.file.create_tz, &ep->dentry.file.create_time, &ep->dentry.file.create_date, &ep->dentry.file.create_time_cs); ts = inode_get_mtime(inode); exfat_set_entry_time(sbi, &ts, &ep->dentry.file.modify_tz, &ep->dentry.file.modify_time, &ep->dentry.file.modify_date, &ep->dentry.file.modify_time_cs); ts = inode_get_atime(inode); exfat_set_entry_time(sbi, &ts, &ep->dentry.file.access_tz, &ep->dentry.file.access_time, &ep->dentry.file.access_date, NULL); /* File size should be zero if there is no cluster allocated */ on_disk_size = i_size_read(inode); if (ei->start_clu == EXFAT_EOF_CLUSTER) on_disk_size = 0; ep2->dentry.stream.size = cpu_to_le64(on_disk_size); /* * mmap write does not use exfat_write_end(), valid_size may be * extended to the sector-aligned length in exfat_get_block(). * So we need to fixup valid_size to the writren length. 
*/ if (on_disk_size < ei->valid_size) ep2->dentry.stream.valid_size = ep2->dentry.stream.size; else ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size); if (on_disk_size) { ep2->dentry.stream.flags = ei->flags; ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu); } else { ep2->dentry.stream.flags = ALLOC_FAT_CHAIN; ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; } exfat_update_dir_chksum(&es); return exfat_put_dentry_set(&es, sync); } int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; if (unlikely(exfat_forced_shutdown(inode->i_sb))) return -EIO; mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); ret = __exfat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); return ret; } void exfat_sync_inode(struct inode *inode) { lockdep_assert_held(&EXFAT_SB(inode->i_sb)->s_lock); __exfat_write_inode(inode, 1); } /* * Input: inode, (logical) clu_offset, target allocation area * Output: errcode, cluster number * *clu = (~0), if it's unable to allocate a new cluster */ static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, unsigned int *clu, int create) { int ret; unsigned int last_clu; struct exfat_chain new_clu; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); unsigned int local_clu_offset = clu_offset; unsigned int num_to_be_allocated = 0, num_clusters; num_clusters = EXFAT_B_TO_CLU(exfat_ondisk_size(inode), sbi); if (clu_offset >= num_clusters) num_to_be_allocated = clu_offset - num_clusters + 1; if (!create && (num_to_be_allocated > 0)) { *clu = EXFAT_EOF_CLUSTER; return 0; } *clu = last_clu = ei->start_clu; if (ei->flags == ALLOC_NO_FAT_CHAIN) { if (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { last_clu += clu_offset - 1; if (clu_offset == num_clusters) *clu = EXFAT_EOF_CLUSTER; else *clu += clu_offset; } } else if (ei->type == TYPE_FILE) { unsigned int fclus = 0; int err = 
exfat_get_cluster(inode, clu_offset,
				&fclus, clu, &last_clu, 1);
		if (err)
			return -EIO;

		/* keep only the distance not covered by the lookup */
		clu_offset -= fclus;
	} else {
		/* hint information */
		if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER &&
		    ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) {
			clu_offset -= ei->hint_bmap.off;
			/* hint_bmap.clu should be valid */
			WARN_ON(ei->hint_bmap.clu < 2);
			*clu = ei->hint_bmap.clu;
		}

		/* walk the chain one cluster at a time, remembering the previous one */
		while (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) {
			last_clu = *clu;
			if (exfat_get_next_cluster(sb, clu))
				return -EIO;
			clu_offset--;
		}
	}

	/* The target lies past the end of the chain: extend the file. */
	if (*clu == EXFAT_EOF_CLUSTER) {
		exfat_set_volume_dirty(sb);

		/* allocation hint: the cluster right after the current last one */
		new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ?
				EXFAT_EOF_CLUSTER : last_clu + 1;
		new_clu.size = 0;
		new_clu.flags = ei->flags;

		/* allocate a cluster */
		if (num_to_be_allocated < 1) {
			/* Broken FAT (i_size > allocated FAT) */
			exfat_fs_error(sb, "broken FAT chain.");
			return -EIO;
		}

		ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu,
				inode_needs_sync(inode));
		if (ret)
			return ret;

		if (new_clu.dir == EXFAT_EOF_CLUSTER ||
		    new_clu.dir == EXFAT_FREE_CLUSTER) {
			exfat_fs_error(sb,
				"bogus cluster new allocated (last_clu : %u, new_clu : %u)",
				last_clu, new_clu.dir);
			return -EIO;
		}

		/* append to the FAT chain */
		if (last_clu == EXFAT_EOF_CLUSTER) {
			/* the file had no clusters: this is its first one */
			if (new_clu.flags == ALLOC_FAT_CHAIN)
				ei->flags = ALLOC_FAT_CHAIN;
			ei->start_clu = new_clu.dir;
		} else {
			if (new_clu.flags != ei->flags) {
				/* no-fat-chain bit is disabled,
				 * so fat-chain should be synced with
				 * alloc-bitmap
				 */
				exfat_chain_cont_cluster(sb, ei->start_clu,
					num_clusters);
				ei->flags = ALLOC_FAT_CHAIN;
			}
			if (new_clu.flags == ALLOC_FAT_CHAIN)
				if (exfat_ent_set(sb, last_clu, new_clu.dir))
					return -EIO;
		}

		num_clusters += num_to_be_allocated;
		*clu = new_clu.dir;

		/* i_blocks is accounted in 512-byte units */
		inode->i_blocks += EXFAT_CLU_TO_B(num_to_be_allocated, sbi) >> 9;

		/*
		 * Move *clu pointer along FAT chains (hole care) because the
		 * caller of this function expect *clu to be the last cluster.
		 * This only works when num_to_be_allocated >= 2,
		 * *clu = (the first cluster of the allocated chain) =>
		 * (the last cluster of ...)
		 */
		if (ei->flags == ALLOC_NO_FAT_CHAIN) {
			*clu += num_to_be_allocated - 1;
		} else {
			while (num_to_be_allocated > 1) {
				if (exfat_get_next_cluster(sb, clu))
					return -EIO;
				num_to_be_allocated--;
			}
		}

	}

	/* hint information */
	ei->hint_bmap.off = local_clu_offset;
	ei->hint_bmap.clu = *clu;

	return 0;
}

/*
 * get_block callback: map logical block @iblock of @inode into @bh_result.
 *
 * Runs under s_lock.  The cluster is looked up (and allocated when @create
 * is set) through exfat_map_cluster(); the mapping never extends past the
 * end of that cluster.  The rest of the function trims or adjusts the
 * mapping according to how it relates to ei->valid_size.  On the "done"
 * path bh_result->b_size is set to the number of bytes covered.
 */
static int exfat_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create)
{
	struct exfat_inode_info *ei = EXFAT_I(inode);
	struct super_block *sb = inode->i_sb;
	struct exfat_sb_info *sbi = EXFAT_SB(sb);
	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
	int err = 0;
	unsigned long mapped_blocks = 0;
	unsigned int cluster, sec_offset;
	sector_t last_block;
	sector_t phys = 0;
	sector_t valid_blks;

	mutex_lock(&sbi->s_lock);
	last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb);
	/* reading at or beyond EOF: leave the buffer unmapped */
	if (iblock >= last_block && !create)
		goto done;

	/* Is this block already allocated? */
	err = exfat_map_cluster(inode, iblock >> sbi->sect_per_clus_bits,
			&cluster, create);
	if (err) {
		/* running out of space is not a filesystem error */
		if (err != -ENOSPC)
			exfat_fs_error_ratelimit(sb,
				"failed to bmap (inode : %p iblock : %llu, err : %d)",
				inode, (unsigned long long)iblock, err);
		goto unlock_ret;
	}

	if (cluster == EXFAT_EOF_CLUSTER)
		goto done;

	/* sector offset in cluster */
	sec_offset = iblock & (sbi->sect_per_clus - 1);

	phys = exfat_cluster_to_sector(sbi, cluster) + sec_offset;
	/* blocks left in this cluster, starting at the requested one */
	mapped_blocks = sbi->sect_per_clus - sec_offset;
	max_blocks = min(mapped_blocks, max_blocks);

	map_bh(bh_result, sb, phys);
	if (buffer_delay(bh_result))
		clear_buffer_delay(bh_result);

	if (create) {
		valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb);

		if (iblock + max_blocks < valid_blks) {
			/* The range has been written, map it */
			goto done;
		} else if (iblock < valid_blks) {
			/*
			 * The range has been partially written,
			 * map the written part.
*/ max_blocks = valid_blks - iblock; goto done; } /* The area has not been written, map and mark as new. */ set_buffer_new(bh_result); ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb); mark_inode_dirty(inode); } else { valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); if (iblock + max_blocks < valid_blks) { /* The range has been written, map it */ goto done; } else if (iblock < valid_blks) { /* * The area has been partially written, * map the written part. */ max_blocks = valid_blks - iblock; goto done; } else if (iblock == valid_blks && (ei->valid_size & (sb->s_blocksize - 1))) { /* * The block has been partially written, * zero the unwritten part and map the block. */ loff_t size, off, pos; max_blocks = 1; /* * For direct read, the unwritten part will be zeroed in * exfat_direct_IO() */ if (!bh_result->b_folio) goto done; pos = EXFAT_BLK_TO_B(iblock, sb); size = ei->valid_size - pos; off = pos & (PAGE_SIZE - 1); folio_set_bh(bh_result, bh_result->b_folio, off); err = bh_read(bh_result, 0); if (err < 0) goto unlock_ret; folio_zero_segment(bh_result->b_folio, off + size, off + sb->s_blocksize); } else { /* * The range has not been written, clear the mapped flag * to only zero the cache and do not read from disk. */ clear_buffer_mapped(bh_result); } } done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); unlock_ret: mutex_unlock(&sbi->s_lock); return err; } static int exfat_read_folio(struct file *file, struct folio *folio) { return mpage_read_folio(folio, exfat_get_block); } static void exfat_readahead(struct readahead_control *rac) { struct address_space *mapping = rac->mapping; struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); loff_t pos = readahead_pos(rac); /* Range cross valid_size, read it page by page. 
*/ if (ei->valid_size < i_size_read(inode) && pos <= ei->valid_size && ei->valid_size < pos + readahead_length(rac)) return; mpage_readahead(rac, exfat_get_block); } static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { if (unlikely(exfat_forced_shutdown(mapping->host->i_sb))) return -EIO; return mpage_writepages(mapping, wbc, exfat_get_block); } static void exfat_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); exfat_truncate(inode); } } static int exfat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **foliop, void **fsdata) { int ret; if (unlikely(exfat_forced_shutdown(mapping->host->i_sb))) return -EIO; ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); return ret; } static int exfat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) exfat_write_failed(mapping, pos+len); if (!(err < 0) && pos + err > ei->valid_size) { ei->valid_size = pos + err; mark_inode_dirty(inode); } if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ei->attr |= EXFAT_ATTR_ARCHIVE; mark_inode_dirty(inode); } return err; } static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); loff_t pos = iocb->ki_pos; loff_t size = pos + iov_iter_count(iter); int rw = 
iov_iter_rw(iter); ssize_t ret; /* * Need to use the DIO_LOCKING for avoiding the race * condition of exfat_get_block() and ->truncate(). */ ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); if (ret < 0) { if (rw == WRITE && ret != -EIOCBQUEUED) exfat_write_failed(mapping, size); return ret; } else size = pos + ret; if (rw == WRITE) { /* * If the block had been partially written before this write, * ->valid_size will not be updated in exfat_get_block(), * update it here. */ if (ei->valid_size < size) { ei->valid_size = size; mark_inode_dirty(inode); } } else if (pos < ei->valid_size && ei->valid_size < size) { /* zero the unwritten part in the partially written block */ iov_iter_revert(iter, size - ei->valid_size); iov_iter_zero(size - ei->valid_size, iter); } return ret; } static sector_t exfat_aop_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* exfat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&EXFAT_I(mapping->host)->truncate_lock); blocknr = generic_block_bmap(mapping, block, exfat_get_block); up_read(&EXFAT_I(mapping->host)->truncate_lock); return blocknr; } /* * exfat_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This is required during truncate to physically zeroout the tail end * of that block so it doesn't yield old data if the file is later grown. 
* Also, avoid causing failure from fsx for cases of "data past EOF" */ int exfat_block_truncate_page(struct inode *inode, loff_t from) { return block_truncate_page(inode->i_mapping, from, exfat_get_block); } static const struct address_space_operations exfat_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, .bmap = exfat_aop_bmap, .migrate_folio = buffer_migrate_folio, }; static inline unsigned long exfat_hash(loff_t i_pos) { return hash_32(i_pos, EXFAT_HASH_BITS); } void exfat_hash_inode(struct inode *inode, loff_t i_pos) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); spin_lock(&sbi->inode_hash_lock); EXFAT_I(inode)->i_pos = i_pos; hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head); spin_unlock(&sbi->inode_hash_lock); } void exfat_unhash_inode(struct inode *inode) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); spin_lock(&sbi->inode_hash_lock); hlist_del_init(&EXFAT_I(inode)->i_hash_fat); EXFAT_I(inode)->i_pos = 0; spin_unlock(&sbi->inode_hash_lock); } struct inode *exfat_iget(struct super_block *sb, loff_t i_pos) { struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *info; struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); struct inode *inode = NULL; spin_lock(&sbi->inode_hash_lock); hlist_for_each_entry(info, head, i_hash_fat) { WARN_ON(info->vfs_inode.i_sb != sb); if (i_pos != info->i_pos) continue; inode = igrab(&info->vfs_inode); if (inode) break; } spin_unlock(&sbi->inode_hash_lock); return inode; } /* doesn't deal with root inode */ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) { struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); struct exfat_inode_info *ei = EXFAT_I(inode); loff_t size = 
info->size; ei->dir = info->dir; ei->entry = info->entry; ei->attr = info->attr; ei->start_clu = info->start_clu; ei->flags = info->flags; ei->type = info->type; ei->valid_size = info->valid_size; ei->version = 0; ei->hint_stat.eidx = 0; ei->hint_stat.clu = info->start_clu; ei->hint_femp.eidx = EXFAT_HINT_NONE; ei->hint_bmap.off = EXFAT_EOF_CLUSTER; ei->i_pos = 0; inode->i_uid = sbi->options.fs_uid; inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = get_random_u32(); if (info->attr & EXFAT_ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; set_nlink(inode, info->num_subdirs); } else { /* regular file */ inode->i_generation |= 1; inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); inode->i_op = &exfat_file_inode_operations; inode->i_fop = &exfat_file_operations; inode->i_mapping->a_ops = &exfat_aops; inode->i_mapping->nrpages = 0; } i_size_write(inode, size); exfat_save_attr(inode, info->attr); inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; inode_set_mtime_to_ts(inode, info->mtime); inode_set_ctime_to_ts(inode, info->mtime); ei->i_crtime = info->crtime; inode_set_atime_to_ts(inode, info->atime); return 0; } struct inode *exfat_build_inode(struct super_block *sb, struct exfat_dir_entry *info, loff_t i_pos) { struct inode *inode; int err; inode = exfat_iget(sb, i_pos); if (inode) goto out; inode = new_inode(sb); if (!inode) { inode = ERR_PTR(-ENOMEM); goto out; } inode->i_ino = iunique(sb, EXFAT_ROOT_INO); inode_set_iversion(inode, 1); err = exfat_fill_inode(inode, info); if (err) { iput(inode); inode = ERR_PTR(err); goto out; } exfat_hash_inode(inode, i_pos); insert_inode_hash(inode); out: return inode; } void exfat_evict_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); if (!inode->i_nlink) { i_size_write(inode, 0); 
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); __exfat_truncate(inode); mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); } invalidate_inode_buffers(inode); clear_inode(inode); exfat_cache_inval_inode(inode); exfat_unhash_inode(inode); }
29 7 1 4 2 2 19 1 11 7 20 12 8 20 22 22 2 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-or-later /* * vimc-debayer.c Virtual Media Controller Driver * * Copyright (C) 2015-2017 Helen Koike <helen.fornazier@gmail.com> */ #include <linux/moduleparam.h> #include <linux/platform_device.h> #include <linux/vmalloc.h> #include <linux/v4l2-mediabus.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/v4l2-subdev.h> #include "vimc-common.h" /* TODO: Add support for more output formats, we only support RGB888 for now. */ #define VIMC_DEBAYER_SOURCE_MBUS_FMT MEDIA_BUS_FMT_RGB888_1X24 enum vimc_debayer_rgb_colors { VIMC_DEBAYER_RED = 0, VIMC_DEBAYER_GREEN = 1, VIMC_DEBAYER_BLUE = 2, }; struct vimc_debayer_pix_map { u32 code; enum vimc_debayer_rgb_colors order[2][2]; }; struct vimc_debayer_device { struct vimc_ent_device ved; struct v4l2_subdev sd; struct v4l2_ctrl_handler hdl; struct media_pad pads[2]; u8 *src_frame; void (*set_rgb_src)(struct vimc_debayer_device *vdebayer, unsigned int lin, unsigned int col, unsigned int rgb[3]); /* * Virtual "hardware" configuration, filled when the stream starts or * when controls are set. 
*/ struct { const struct vimc_debayer_pix_map *sink_pix_map; unsigned int sink_bpp; struct v4l2_area size; unsigned int mean_win_size; u32 src_code; } hw; };

/* Format applied to both pads by vimc_debayer_init_state() (the source pad
 * gets a different media bus code).
 */
static const struct v4l2_mbus_framefmt sink_fmt_default = {
	.width = 640,
	.height = 480,
	.code = MEDIA_BUS_FMT_SRGGB8_1X8,
	.field = V4L2_FIELD_NONE,
	.colorspace = V4L2_COLORSPACE_SRGB,
};

/* Media bus codes enumerated and accepted as valid on the source pad. */
static const u32 vimc_debayer_src_mbus_codes[] = {
	MEDIA_BUS_FMT_GBR888_1X24,
	MEDIA_BUS_FMT_BGR888_1X24,
	MEDIA_BUS_FMT_BGR888_3X8,
	MEDIA_BUS_FMT_RGB888_1X24,
	MEDIA_BUS_FMT_RGB888_2X12_BE,
	MEDIA_BUS_FMT_RGB888_2X12_LE,
	MEDIA_BUS_FMT_RGB888_3X8,
	MEDIA_BUS_FMT_RGB888_1X7X4_SPWG,
	MEDIA_BUS_FMT_RGB888_1X7X4_JEIDA,
	MEDIA_BUS_FMT_RGB888_1X32_PADHI,
};

/*
 * Sink-pad Bayer formats and, for each, which colour sits at
 * order[line % 2][column % 2] of the 2x2 pattern (this is how the table is
 * indexed in vimc_debayer_calc_rgb_sink()).
 */
static const struct vimc_debayer_pix_map vimc_debayer_pix_map_list[] = {
	{
		.code = MEDIA_BUS_FMT_SBGGR8_1X8,
		.order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN },
			   { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } }
	},
	{
		.code = MEDIA_BUS_FMT_SGBRG8_1X8,
		.order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE },
			   { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } }
	},
	{
		.code = MEDIA_BUS_FMT_SGRBG8_1X8,
		.order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED },
			   { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } }
	},
	{
		.code = MEDIA_BUS_FMT_SRGGB8_1X8,
		.order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN },
			   { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } }
	},
	{
		.code = MEDIA_BUS_FMT_SBGGR10_1X10,
		.order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN },
			   { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } }
	},
	{
		.code = MEDIA_BUS_FMT_SGBRG10_1X10,
		.order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE },
			   { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } }
	},
	{
		.code = MEDIA_BUS_FMT_SGRBG10_1X10,
		.order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED },
			   { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } }
	},
	{
		.code = MEDIA_BUS_FMT_SRGGB10_1X10,
		.order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN },
			   { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } }
	},
	{
		.code = MEDIA_BUS_FMT_SBGGR12_1X12,
		.order = { { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN },
			   { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED } }
	},
	{
		.code = MEDIA_BUS_FMT_SGBRG12_1X12,
		.order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE },
			   { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN } }
	},
	{
		.code = MEDIA_BUS_FMT_SGRBG12_1X12,
		.order = { { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_RED },
			   { VIMC_DEBAYER_BLUE, VIMC_DEBAYER_GREEN } }
	},
	{
		.code = MEDIA_BUS_FMT_SRGGB12_1X12,
		.order = { { VIMC_DEBAYER_RED, VIMC_DEBAYER_GREEN },
			   { VIMC_DEBAYER_GREEN, VIMC_DEBAYER_BLUE } }
	},
};

/*
 * Linear search of vimc_debayer_pix_map_list for a media bus code.
 * Returns the matching entry, or NULL when the code is not in the table.
 */
static const struct vimc_debayer_pix_map *vimc_debayer_pix_map_by_code(u32 code)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(vimc_debayer_pix_map_list); i++)
		if (vimc_debayer_pix_map_list[i].code == code)
			return &vimc_debayer_pix_map_list[i];

	return NULL;
}

/* True when code is listed in vimc_debayer_src_mbus_codes. */
static bool vimc_debayer_src_code_is_valid(u32 code)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(vimc_debayer_src_mbus_codes); i++)
		if (vimc_debayer_src_mbus_codes[i] == code)
			return true;

	return false;
}

/*
 * Subdev .init_state: set pad 0 to sink_fmt_default and pad 1 to the same
 * format with its code replaced by VIMC_DEBAYER_SOURCE_MBUS_FMT.
 */
static int vimc_debayer_init_state(struct v4l2_subdev *sd,
				   struct v4l2_subdev_state *sd_state)
{
	struct v4l2_mbus_framefmt *mf;

	mf = v4l2_subdev_state_get_format(sd_state, 0);
	*mf = sink_fmt_default;

	mf = v4l2_subdev_state_get_format(sd_state, 1);
	*mf = sink_fmt_default;
	mf->code = VIMC_DEBAYER_SOURCE_MBUS_FMT;

	return 0;
}

/*
 * Pad op .enum_mbus_code: report the code at code->index from the source
 * code list (source pad) or from the Bayer table (any other pad).
 * Returns -EINVAL when the index is out of range.
 */
static int vimc_debayer_enum_mbus_code(struct v4l2_subdev *sd,
				       struct v4l2_subdev_state *sd_state,
				       struct v4l2_subdev_mbus_code_enum *code)
{
	if (VIMC_IS_SRC(code->pad)) {
		if (code->index >= ARRAY_SIZE(vimc_debayer_src_mbus_codes))
			return -EINVAL;

		code->code = vimc_debayer_src_mbus_codes[code->index];
	} else {
		if (code->index >= ARRAY_SIZE(vimc_debayer_pix_map_list))
			return -EINVAL;

		code->code = vimc_debayer_pix_map_list[code->index].code;
	}

	return 0;
}

/*
 * Pad op .enum_frame_size: only index 0 exists. The code must be known for
 * the pad it is queried on; the reported range is the VIMC_FRAME_MIN/MAX
 * width and height. Returns -EINVAL for any other index or an unknown code.
 */
static int vimc_debayer_enum_frame_size(struct v4l2_subdev *sd,
					struct v4l2_subdev_state *sd_state,
					struct v4l2_subdev_frame_size_enum *fse)
{
	if (fse->index)
		return -EINVAL;

	if (VIMC_IS_SINK(fse->pad)) {
		const struct vimc_debayer_pix_map *vpix =
			vimc_debayer_pix_map_by_code(fse->code);

		if (!vpix)
			return -EINVAL;
	} else if (!vimc_debayer_src_code_is_valid(fse->code)) {
		return -EINVAL;
	}

	fse->min_width = VIMC_FRAME_MIN_WIDTH;
	fse->max_width = VIMC_FRAME_MAX_WIDTH;
	fse->min_height = VIMC_FRAME_MIN_HEIGHT;
	fse->max_height = VIMC_FRAME_MAX_HEIGHT;

	return 0;
}

/*
 * Coerce a requested sink format into one this entity accepts: unknown codes
 * fall back to the default code, width and height are clamped to the VIMC
 * limits and forced even (low bit cleared), V4L2_FIELD_ANY becomes the
 * default field, and colorimetry goes through vimc_colorimetry_clamp().
 */
static void vimc_debayer_adjust_sink_fmt(struct v4l2_mbus_framefmt *fmt)
{
	const struct vimc_debayer_pix_map *vpix;

	/* Don't accept a code that is not on the debayer table */
	vpix = vimc_debayer_pix_map_by_code(fmt->code);
	if (!vpix)
		fmt->code = sink_fmt_default.code;

	fmt->width = clamp_t(u32, fmt->width, VIMC_FRAME_MIN_WIDTH,
			     VIMC_FRAME_MAX_WIDTH) & ~1;
	fmt->height = clamp_t(u32, fmt->height, VIMC_FRAME_MIN_HEIGHT,
			      VIMC_FRAME_MAX_HEIGHT) & ~1;

	if (fmt->field == V4L2_FIELD_ANY)
		fmt->field = sink_fmt_default.field;

	vimc_colorimetry_clamp(fmt);
}

/*
 * Pad op .set_fmt.
 *
 * Returns -EBUSY for an ACTIVE format change while a source frame buffer is
 * allocated (i.e. while streaming). A request on the source pad is answered
 * with the current format. A request on the sink pad is adjusted, stored on
 * pad 0, and copied to pad 1 with the code forced to
 * VIMC_DEBAYER_SOURCE_MBUS_FMT.
 */
static int vimc_debayer_set_fmt(struct v4l2_subdev *sd,
				struct v4l2_subdev_state *sd_state,
				struct v4l2_subdev_format *fmt)
{
	struct vimc_debayer_device *vdebayer = v4l2_get_subdevdata(sd);
	struct v4l2_mbus_framefmt *format;

	/* Do not change the format while stream is on. */
	if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE && vdebayer->src_frame)
		return -EBUSY;

	/*
	 * Do not change the format of the source pad, it is propagated from
	 * the sink.
	 */
	if (VIMC_IS_SRC(fmt->pad))
		return v4l2_subdev_get_fmt(sd, sd_state, fmt);

	/* Set the new format in the sink pad. */
	vimc_debayer_adjust_sink_fmt(&fmt->format);

	format = v4l2_subdev_state_get_format(sd_state, 0);

	dev_dbg(vdebayer->ved.dev, "%s: sink format update: "
		"old:%dx%d (0x%x, %d, %d, %d, %d) "
		"new:%dx%d (0x%x, %d, %d, %d, %d)\n", vdebayer->sd.name,
		/* old */
		format->width, format->height, format->code,
		format->colorspace, format->quantization,
		format->xfer_func, format->ycbcr_enc,
		/* new */
		fmt->format.width, fmt->format.height, fmt->format.code,
		fmt->format.colorspace,	fmt->format.quantization,
		fmt->format.xfer_func, fmt->format.ycbcr_enc);

	*format = fmt->format;

	/* Propagate the format to the source pad. */
	format = v4l2_subdev_state_get_format(sd_state, 1);
	*format = fmt->format;
	format->code = VIMC_DEBAYER_SOURCE_MBUS_FMT;

	return 0;
}

/* Pad operations exposed by the debayer subdevice. */
static const struct v4l2_subdev_pad_ops vimc_debayer_pad_ops = {
	.enum_mbus_code		= vimc_debayer_enum_mbus_code,
	.enum_frame_size	= vimc_debayer_enum_frame_size,
	.get_fmt		= v4l2_subdev_get_fmt,
	.set_fmt		= vimc_debayer_set_fmt,
};

/*
 * Store one computed RGB triple into the source frame at (lin, col), using
 * 3 bytes per pixel. The byte order follows the pixel format mapped from
 * hw.src_code: rgb[0..2] for RGB24, reversed for BGR24.
 *
 * NOTE(review): vpix is dereferenced without a NULL check, and pixel formats
 * other than RGB24/BGR24 leave the destination bytes unwritten - confirm
 * that hw.src_code can only map to one of those two formats.
 */
static void vimc_debayer_process_rgb_frame(struct vimc_debayer_device *vdebayer,
					   unsigned int lin,
					   unsigned int col,
					   unsigned int rgb[3])
{
	const struct vimc_pix_map *vpix;
	unsigned int i, index;

	vpix = vimc_pix_map_by_code(vdebayer->hw.src_code);
	index = VIMC_FRAME_INDEX(lin, col, vdebayer->hw.size.width, 3);
	for (i = 0; i < 3; i++) {
		switch (vpix->pixelformat) {
		case V4L2_PIX_FMT_RGB24:
			vdebayer->src_frame[index + i] = rgb[i];
			break;
		case V4L2_PIX_FMT_BGR24:
			vdebayer->src_frame[index + i] = rgb[2 - i];
			break;
		}
	}
}

/*
 * Video op .s_stream.
 *
 * enable: snapshot the active sink/source formats into vdebayer->hw while
 * the subdev state is locked, then vmalloc() the source frame buffer.
 * Returns 0 immediately if a buffer already exists, -ENOMEM if the
 * allocation fails.
 * disable: free the buffer (if any) and reset the pointer to NULL.
 *
 * NOTE(review): both vimc_pix_map_by_code() results are dereferenced without
 * a NULL check, and frame_size is an unchecked unsigned product - presumably
 * safe given the format clamping done in set_fmt; confirm.
 */
static int vimc_debayer_s_stream(struct v4l2_subdev *sd, int enable)
{
	struct vimc_debayer_device *vdebayer = v4l2_get_subdevdata(sd);

	if (enable) {
		const struct v4l2_mbus_framefmt *sink_fmt;
		const struct v4l2_mbus_framefmt *src_fmt;
		struct v4l2_subdev_state *state;
		const struct vimc_pix_map *vpix;
		unsigned int frame_size;

		/* A non-NULL src_frame doubles as the "already streaming" flag */
		if (vdebayer->src_frame)
			return 0;

		state = v4l2_subdev_lock_and_get_active_state(sd);
		sink_fmt = v4l2_subdev_state_get_format(state, 0);
		src_fmt = v4l2_subdev_state_get_format(state, 1);

		/* Calculate the frame size of the source pad */
		vpix = vimc_pix_map_by_code(src_fmt->code);
		frame_size = src_fmt->width * src_fmt->height * vpix->bpp;

		/* Save the bytes per pixel of the sink */
		vpix = vimc_pix_map_by_code(sink_fmt->code);
		vdebayer->hw.sink_bpp = vpix->bpp;

		/* Get the corresponding pixel map from the table */
		vdebayer->hw.sink_pix_map =
			vimc_debayer_pix_map_by_code(sink_fmt->code);

		vdebayer->hw.size.width = sink_fmt->width;
		vdebayer->hw.size.height = sink_fmt->height;

		vdebayer->hw.src_code = src_fmt->code;

		v4l2_subdev_unlock_state(state);

		/*
		 * Allocate the frame buffer. Use vmalloc to be able to
		 * allocate a large amount of memory
		 */
		vdebayer->src_frame = vmalloc(frame_size);
		if (!vdebayer->src_frame)
			return -ENOMEM;
	} else {
		if (!vdebayer->src_frame)
			return 0;

		vfree(vdebayer->src_frame);
		vdebayer->src_frame = NULL;
	}

	return 0;
}

/* Core operations: control-framework helpers for logging and events. */
static const struct v4l2_subdev_core_ops vimc_debayer_core_ops = {
	.log_status = v4l2_ctrl_subdev_log_status,
	.subscribe_event = v4l2_ctrl_subdev_subscribe_event,
	.unsubscribe_event = v4l2_event_subdev_unsubscribe,
};

/* Video operations: only stream on/off is implemented. */
static const struct v4l2_subdev_video_ops vimc_debayer_video_ops = {
	.s_stream = vimc_debayer_s_stream,
};

/* Aggregate of the core, pad and video operation tables above. */
static const struct v4l2_subdev_ops vimc_debayer_ops = {
	.core = &vimc_debayer_core_ops,
	.pad = &vimc_debayer_pad_ops,
	.video = &vimc_debayer_video_ops,
};

/* Internal operations: state initialisation hook. */
static const struct v4l2_subdev_internal_ops vimc_debayer_internal_ops = {
	.init_state = vimc_debayer_init_state,
};

/*
 * Assemble an unsigned value from n_bytes consecutive bytes, with bytes[0]
 * as the least significant byte.
 *
 * NOTE(review): bytes[i] is promoted to int before the shift, so this relies
 * on n_bytes being small enough that the shift stays within int range -
 * confirm against the possible values of hw.sink_bpp.
 */
static unsigned int vimc_debayer_get_val(const u8 *bytes,
					 const unsigned int n_bytes)
{
	unsigned int i;
	unsigned int acc = 0;

	for (i = 0; i < n_bytes; i++)
		acc = acc + (bytes[i] << (8 * i));

	return acc;
}

/*
 * Compute the RGB value of output pixel (lin, col) from the Bayer sink frame.
 *
 * For every sink pixel inside a square window centred on (lin, col) - half
 * width hw.mean_win_size / 2, clipped to the frame - the sample is added to
 * the accumulator of the colour it carries according to hw.sink_pix_map.
 * Each of rgb[0..2] is then divided by the number of samples of that colour;
 * a colour with no sample in the window stays 0.
 */
static void vimc_debayer_calc_rgb_sink(struct vimc_debayer_device *vdebayer,
				       const u8 *frame,
				       const unsigned int lin,
				       const unsigned int col,
				       unsigned int rgb[3])
{
	unsigned int i, seek, wlin, wcol;
	unsigned int n_rgb[3] = {0, 0, 0};

	for (i = 0; i < 3; i++)
		rgb[i] = 0;

	/*
	 * Calculate how many we need to subtract to get to the pixel in
	 * the top left corner of the mean window (considering the current
	 * pixel as the center)
	 */
	seek = vdebayer->hw.mean_win_size / 2;

	/* Sum the values of the colors in the mean window */

	/*
	 * NOTE(review): the argument printed for "window mean" is
	 * hw.size.height, not hw.mean_win_size - confirm this is intended.
	 */
	dev_dbg(vdebayer->ved.dev,
		"deb: %s: --- Calc pixel %dx%d, window mean %d, seek %d ---\n",
		vdebayer->sd.name, lin, col, vdebayer->hw.size.height, seek);

	/*
	 * Iterate through all the lines in the mean window, start
	 * with zero if the pixel is outside the frame and don't pass
	 * the height when the pixel is in the bottom border of the
	 * frame
	 */
	for (wlin = seek > lin ? 0 : lin - seek;
	     wlin < lin + seek + 1 && wlin < vdebayer->hw.size.height;
	     wlin++) {

		/*
		 * Iterate through all the columns in the mean window, start
		 * with zero if the pixel is outside the frame and don't pass
		 * the width when the pixel is in the right border of the
		 * frame
		 */
		for (wcol = seek > col ? 0 : col - seek;
		     wcol < col + seek + 1 && wcol < vdebayer->hw.size.width;
		     wcol++) {
			enum vimc_debayer_rgb_colors color;
			unsigned int index;

			/* Check which color this pixel is */
			color = vdebayer->hw.sink_pix_map->order[wlin % 2][wcol % 2];

			index = VIMC_FRAME_INDEX(wlin, wcol,
						 vdebayer->hw.size.width,
						 vdebayer->hw.sink_bpp);

			dev_dbg(vdebayer->ved.dev,
				"deb: %s: RGB CALC: frame index %d, win pos %dx%d, color %d\n",
				vdebayer->sd.name, index, wlin, wcol, color);

			/* Get its value */
			rgb[color] = rgb[color] +
				vimc_debayer_get_val(&frame[index],
						     vdebayer->hw.sink_bpp);

			/* Save how many values we already added */
			n_rgb[color]++;

			dev_dbg(vdebayer->ved.dev, "deb: %s: RGB CALC: val %d, n %d\n",
				vdebayer->sd.name, rgb[color], n_rgb[color]);
		}
	}

	/* Calculate the mean */
	for (i = 0; i < 3; i++) {
		dev_dbg(vdebayer->ved.dev,
			"deb: %s: PRE CALC: %dx%d Color %d, val %d, n %d\n",
			vdebayer->sd.name, lin, col, i, rgb[i], n_rgb[i]);

		/* Skip the division for colours with no sample in the window */
		if (n_rgb[i])
			rgb[i] = rgb[i] / n_rgb[i];

		dev_dbg(vdebayer->ved.dev,
			"deb: %s: FINAL CALC: %dx%d Color %d, val %d\n",
			vdebayer->sd.name, lin, col, i, rgb[i]);
	}
}

/*
 * Entity process_frame hook: convert a whole sink frame into the source
 * frame buffer, one output pixel at a time, through the set_rgb_src
 * callback. Returns the source frame buffer, or ERR_PTR(-EINVAL) when no
 * buffer is allocated (stream not started).
 */
static void *vimc_debayer_process_frame(struct vimc_ent_device *ved,
					const void *sink_frame)
{
	struct vimc_debayer_device *vdebayer =
		container_of(ved, struct vimc_debayer_device, ved);

	unsigned int rgb[3];
	unsigned int i, j;

	/* If the stream in this node is not active, just return */
	if (!vdebayer->src_frame)
		return ERR_PTR(-EINVAL);

	for (i = 0; i < vdebayer->hw.size.height; i++)
		for (j = 0; j < vdebayer->hw.size.width; j++) {
			vimc_debayer_calc_rgb_sink(vdebayer, sink_frame, i, j, rgb);
			vdebayer->set_rgb_src(vdebayer, i, j, rgb);
		}

	return vdebayer->src_frame;
}

/*
 * Control handler: VIMC_CID_MEAN_WIN_SIZE updates hw.mean_win_size; any
 * other control id is rejected with -EINVAL.
 */
static int vimc_debayer_s_ctrl(struct v4l2_ctrl *ctrl)
{
	struct vimc_debayer_device *vdebayer =
		container_of(ctrl->handler, struct vimc_debayer_device, hdl);

	switch (ctrl->id) {
	case VIMC_CID_MEAN_WIN_SIZE:
		vdebayer->hw.mean_win_size = ctrl->val;
		break;
	default:
		return -EINVAL;
	}
	return 0;
}

/* Control operations for the custom controls registered in vimc_debayer_add(). */
static const struct v4l2_ctrl_ops vimc_debayer_ctrl_ops = {
	.s_ctrl = vimc_debayer_s_ctrl,
};

/*
 * Entity release hook: free the control handler, clean up the subdev and the
 * media entity, then free the device structure itself.
 */
static void vimc_debayer_release(struct vimc_ent_device *ved)
{
	struct vimc_debayer_device *vdebayer =
		container_of(ved, struct vimc_debayer_device, ved);

	v4l2_ctrl_handler_free(&vdebayer->hdl);
	v4l2_subdev_cleanup(&vdebayer->sd);
	media_entity_cleanup(vdebayer->ved.ent);
	kfree(vdebayer);
}

/* Control-class entry under which the VIMC custom controls are grouped. */
static const struct v4l2_ctrl_config vimc_debayer_ctrl_class = {
	.flags = V4L2_CTRL_FLAG_READ_ONLY | V4L2_CTRL_FLAG_WRITE_ONLY,
	.id = VIMC_CID_VIMC_CLASS,
	.name = "VIMC Controls",
	.type = V4L2_CTRL_TYPE_CTRL_CLASS,
};

/*
 * Integer control for the mean window size: range 1..25 in steps of 2,
 * default 3. The default is also used to seed hw.mean_win_size in
 * vimc_debayer_add().
 */
static const struct v4l2_ctrl_config vimc_debayer_ctrl_mean_win_size = {
	.ops = &vimc_debayer_ctrl_ops,
	.id = VIMC_CID_MEAN_WIN_SIZE,
	.name = "Debayer Mean Window Size",
	.type = V4L2_CTRL_TYPE_INTEGER,
	.min = 1,
	.max = 25,
	.step = 2,
	.def = 3,
};

/*
 * Create and register a debayer entity.
 *
 * Allocates the device, registers the two custom controls, sets up one sink
 * pad (index 0) and one source pad (index 1), and registers the subdevice
 * through vimc_ent_sd_register(). Returns the embedded vimc_ent_device on
 * success or an ERR_PTR() carrying -ENOMEM, the control handler error, or
 * the registration error; on failure everything allocated here is released.
 */
static struct vimc_ent_device *vimc_debayer_add(struct vimc_device *vimc,
						const char *vcfg_name)
{
	struct v4l2_device *v4l2_dev = &vimc->v4l2_dev;
	struct vimc_debayer_device *vdebayer;
	int ret;

	/* Allocate the vdebayer struct */
	vdebayer = kzalloc(sizeof(*vdebayer), GFP_KERNEL);
	if (!vdebayer)
		return ERR_PTR(-ENOMEM);

	/* Create controls: */
	v4l2_ctrl_handler_init(&vdebayer->hdl, 2);
	v4l2_ctrl_new_custom(&vdebayer->hdl, &vimc_debayer_ctrl_class, NULL);
	v4l2_ctrl_new_custom(&vdebayer->hdl, &vimc_debayer_ctrl_mean_win_size, NULL);
	vdebayer->sd.ctrl_handler = &vdebayer->hdl;
	/* Errors from the calls above are accumulated in hdl.error */
	if (vdebayer->hdl.error) {
		ret = vdebayer->hdl.error;
		goto err_free_vdebayer;
	}

	/* Initialize ved and sd */
	vdebayer->pads[0].flags = MEDIA_PAD_FL_SINK;
	vdebayer->pads[1].flags = MEDIA_PAD_FL_SOURCE;

	ret = vimc_ent_sd_register(&vdebayer->ved, &vdebayer->sd, v4l2_dev,
				   vcfg_name,
				   MEDIA_ENT_F_PROC_VIDEO_PIXEL_ENC_CONV, 2,
				   vdebayer->pads, &vimc_debayer_internal_ops,
				   &vimc_debayer_ops);
	if (ret)
		goto err_free_hdl;

	vdebayer->ved.process_frame = vimc_debayer_process_frame;
	vdebayer->ved.dev = vimc->mdev.dev;
	vdebayer->hw.mean_win_size = vimc_debayer_ctrl_mean_win_size.def;
	vdebayer->set_rgb_src = vimc_debayer_process_rgb_frame;

	return &vdebayer->ved;

err_free_hdl:
	v4l2_ctrl_handler_free(&vdebayer->hdl);
err_free_vdebayer:
	kfree(vdebayer);

	return ERR_PTR(ret);
}

/* Entity type descriptor exporting the add/release hooks defined above. */
const struct vimc_ent_type vimc_debayer_type = {
	.add = vimc_debayer_add,
	.release = vimc_debayer_release
};
678 3 917 4 4 1031 46 1021 346 297 3 43 579 238 459 452 21 21 712 545 7 207 207 455 135 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/l3mdev/l3mdev.c - L3 master device implementation * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> */ #include <linux/netdevice.h> #include <net/fib_rules.h> #include <net/l3mdev.h> static DEFINE_SPINLOCK(l3mdev_lock); struct l3mdev_handler { lookup_by_table_id_t dev_lookup; }; static struct l3mdev_handler l3mdev_handlers[L3MDEV_TYPE_MAX + 1]; static int l3mdev_check_type(enum l3mdev_type l3type) { if (l3type <= L3MDEV_TYPE_UNSPEC || l3type > L3MDEV_TYPE_MAX) return -EINVAL; return 0; } int l3mdev_table_lookup_register(enum l3mdev_type l3type, lookup_by_table_id_t fn) { struct l3mdev_handler *hdlr; int res; res = l3mdev_check_type(l3type); if (res) return res; hdlr = &l3mdev_handlers[l3type]; 
spin_lock(&l3mdev_lock); if (hdlr->dev_lookup) { res = -EBUSY; goto unlock; } hdlr->dev_lookup = fn; res = 0; unlock: spin_unlock(&l3mdev_lock); return res; } EXPORT_SYMBOL_GPL(l3mdev_table_lookup_register);

/*
 * Remove the table-id lookup handler for an l3mdev type.
 *
 * Does nothing for an invalid type. The handler slot is cleared, under
 * l3mdev_lock, only if it currently holds exactly fn.
 */
void l3mdev_table_lookup_unregister(enum l3mdev_type l3type,
				    lookup_by_table_id_t fn)
{
	struct l3mdev_handler *hdlr;

	if (l3mdev_check_type(l3type))
		return;

	hdlr = &l3mdev_handlers[l3type];

	spin_lock(&l3mdev_lock);

	if (hdlr->dev_lookup == fn)
		hdlr->dev_lookup = NULL;

	spin_unlock(&l3mdev_lock);
}
EXPORT_SYMBOL_GPL(l3mdev_table_lookup_unregister);

/*
 * Translate a table id into an interface index using the handler registered
 * for l3type.
 *
 * Returns the error from l3mdev_check_type() for an invalid type, -EINVAL
 * when no handler is registered, and otherwise whatever the handler returns.
 * The handler is invoked with l3mdev_lock held.
 */
int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type,
				      struct net *net, u32 table_id)
{
	lookup_by_table_id_t lookup;
	struct l3mdev_handler *hdlr;
	int ifindex = -EINVAL;
	int res;

	res = l3mdev_check_type(l3type);
	if (res)
		return res;

	hdlr = &l3mdev_handlers[l3type];

	spin_lock(&l3mdev_lock);

	lookup = hdlr->dev_lookup;
	if (!lookup)
		goto unlock;

	ifindex = lookup(net, table_id);

unlock:
	spin_unlock(&l3mdev_lock);

	return ifindex;
}
EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id);

/**
 *	l3mdev_master_ifindex_rcu - get index of L3 master device
 *	@dev: targeted interface
 *
 *	Returns @dev's own ifindex when it is an L3 master, the ifindex of its
 *	master upper device when it is an L3 slave that has one, and 0 in all
 *	other cases (including a NULL @dev).
 */

int l3mdev_master_ifindex_rcu(const struct net_device *dev)
{
	int ifindex = 0;

	if (!dev)
		return 0;

	if (netif_is_l3_master(dev)) {
		ifindex = dev->ifindex;
	} else if (netif_is_l3_slave(dev)) {
		struct net_device *master;
		struct net_device *_dev = (struct net_device *)dev;

		/* netdev_master_upper_dev_get_rcu calls
		 * list_first_or_null_rcu to walk the upper dev list.
		 * list_first_or_null_rcu does not handle a const arg. We aren't
		 * making changes, just want the master device from that list so
		 * typecast to remove the const
		 */
		master = netdev_master_upper_dev_get_rcu(_dev);
		if (master)
			ifindex = master->ifindex;
	}

	return ifindex;
}
EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);

/**
 *	l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3 master
 *					       device
 *	@net: network namespace for device index lookup
 *	@ifindex: targeted interface
 *
 *	Starting from the device with index @ifindex, follows master upper
 *	devices until an L3 master is found. Returns that device's ifindex, or
 *	0 when the lookup fails or the chain ends without an L3 master.
 */
int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;

	dev = dev_get_by_index_rcu(net, ifindex);
	while (dev && !netif_is_l3_master(dev))
		dev = netdev_master_upper_dev_get_rcu(dev);

	return dev ? dev->ifindex : 0;
}
EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);

/**
 *	l3mdev_fib_table_rcu - get FIB table id associated with an L3
 *                             master interface
 *	@dev: targeted interface
 *
 *	Asks the l3mdev_fib_table operation of @dev (if it is an L3 master) or
 *	of its master upper device (if it is an L3 slave). Returns 0 when @dev
 *	is NULL, is neither master nor slave, has no master, or the operation
 *	is not provided.
 */

u32 l3mdev_fib_table_rcu(const struct net_device *dev)
{
	u32 tb_id = 0;

	if (!dev)
		return 0;

	if (netif_is_l3_master(dev)) {
		if (dev->l3mdev_ops->l3mdev_fib_table)
			tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev);
	} else if (netif_is_l3_slave(dev)) {
		/* Users of netdev_master_upper_dev_get_rcu need non-const,
		 * but current inet_*type functions take a const
		 */
		struct net_device *_dev = (struct net_device *) dev;
		const struct net_device *master;

		master = netdev_master_upper_dev_get_rcu(_dev);
		if (master &&
		    master->l3mdev_ops->l3mdev_fib_table)
			tb_id = master->l3mdev_ops->l3mdev_fib_table(master);
	}

	return tb_id;
}
EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu);

/*
 * Wrapper around l3mdev_fib_table_rcu() that takes an interface index and
 * handles the RCU read-side locking itself. Returns 0 for a zero ifindex or
 * when no device with that index exists in net.
 */
u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	u32 tb_id = 0;

	if (!ifindex)
		return 0;

	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		tb_id = l3mdev_fib_table_rcu(dev);

	rcu_read_unlock();

	return tb_id;
}
EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);

/** * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link * local and multicast addresses * @net: network namespace for device index lookup * @fl6: 
IPv6 flow struct for lookup * This function does not hold refcnt on the returned dst. * Caller must hold rcu_read_lock(). */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; WARN_ON_ONCE(!rcu_read_lock_held()); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); } return dst; } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device * @net: network namespace for device index lookup * @fl: flow struct * @arg: store the table the rule matched with here */ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { struct net_device *dev; int rc = 0; /* update flow ensures flowi_l3mdev is set when relevant */ if (!fl->flowi_l3mdev) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, fl->flowi_l3mdev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_fib_table) { arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); rc = 1; } rcu_read_unlock(); return rc; } void l3mdev_update_flow(struct net *net, struct flowi *fl) { struct net_device *dev; rcu_read_lock(); if (fl->flowi_oif) { dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev) { if (!fl->flowi_l3mdev) fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); /* oif set to L3mdev directs lookup to its table; * reset to avoid oif match in fib_lookup */ if (netif_is_l3_master(dev)) fl->flowi_oif = 0; goto out; } } if (fl->flowi_iif > LOOPBACK_IFINDEX && !fl->flowi_l3mdev) { dev = dev_get_by_index_rcu(net, fl->flowi_iif); if (dev) fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); } out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(l3mdev_update_flow);
41 14 29 20 32 52 52 52 51 52 52 43 43 32 20 16 32 32 32 11 11 11 11 3 9 11 11 11 43 43 32 7 19 43 23 43 43 43 37 1 36 62 37 61 61 1 1 1 1 1 1 54 54 1 36 44 51 52 1 45 36 52 3 37 36 37 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 
481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 
981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 
1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 
2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 
2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 
2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_icreate_item.h" #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_health.h" /* * Lookup a record by ino in the btree given by cur. 
*/ int /* error */ xfs_inobt_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agino_t ino, /* starting inode of chunk */ xfs_lookup_t dir, /* <=, >=, == */ int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; cur->bc_rec.i.ir_holemask = 0; cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); } /* * Update the record referred to by cur to the value given. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int /* error */ xfs_inobt_update( struct xfs_btree_cur *cur, /* btree cursor */ xfs_inobt_rec_incore_t *irec) /* btree record */ { union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); if (xfs_has_sparseinodes(cur->bc_mp)) { rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); rec.inobt.ir_u.sp.ir_count = irec->ir_count; rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; } else { /* ir_holemask/ir_count not supported on-disk */ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } /* Convert on-disk btree record to incore inobt record. */ void xfs_inobt_btrec_to_irec( struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec) { irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); if (xfs_has_sparseinodes(mp)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; } else { /* * ir_holemask/ir_count not supported on-disk. Fill in hardcoded * values for full inode chunks. */ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; irec->ir_count = XFS_INODES_PER_CHUNK; irec->ir_freecount = be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } /* Compute the freecount of an incore inode record. 
*/ uint8_t xfs_inobt_rec_freecount( const struct xfs_inobt_rec_incore *irec) { uint64_t realfree = irec->ir_free; if (xfs_inobt_issparse(irec->ir_holemask)) realfree &= xfs_inobt_irec_to_allocmask(irec); return hweight64(realfree); } /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { /* Record has to be properly aligned within the AG. */ if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) return __this_address; if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; } static inline int xfs_inobt_complain_bad_rec( struct xfs_btree_cur *cur, xfs_failaddr_t fa, const struct xfs_inobt_rec_incore *irec) { struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, irec->ir_free, irec->ir_holemask); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } /* * Get the data from the pointed-to record. */ int xfs_inobt_get_rec( struct xfs_btree_cur *cur, struct xfs_inobt_rec_incore *irec, int *stat) { struct xfs_mount *mp = cur->bc_mp; union xfs_btree_rec *rec; xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || *stat == 0) return error; xfs_inobt_btrec_to_irec(mp, rec, irec); fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); return 0; } /* * Insert a single inobt record. Cursor must already point to desired location. 
*/
/*
 * Only holemask, count, freecount and free are staged in the cursor here;
 * ir_startino is not written.  The callers visible in this file position the
 * cursor (and set ir_startino) with xfs_inobt_lookup() first.
 */
int
xfs_inobt_insert_rec(
	struct xfs_btree_cur	*cur,
	uint16_t		holemask,
	uint8_t			count,
	int32_t			freecount,
	xfs_inofree_t		free,
	int			*stat)
{
	cur->bc_rec.i.ir_holemask = holemask;
	cur->bc_rec.i.ir_count = count;
	cur->bc_rec.i.ir_freecount = freecount;
	cur->bc_rec.i.ir_free = free;
	return xfs_btree_insert(cur, stat);
}

/*
 * Insert records describing a newly allocated inode chunk into the inobt.
 *
 * One full, all-free record is inserted per XFS_INODES_PER_CHUNK inodes in
 * [newino, newino + newlen).  @is_finobt selects the free inode btree cursor
 * instead of the inode btree cursor.  The cursor is always deleted before
 * returning, with XFS_BTREE_ERROR on the failure paths.
 */
STATIC int
xfs_inobt_insert(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agino_t		newino,
	xfs_agino_t		newlen,
	bool			is_finobt)
{
	struct xfs_btree_cur	*cur;
	xfs_agino_t		thisino;
	int			i;
	int			error;

	if (is_finobt)
		cur = xfs_finobt_init_cursor(pag, tp, agbp);
	else
		cur = xfs_inobt_init_cursor(pag, tp, agbp);

	for (thisino = newino;
	     thisino < newino + newlen;
	     thisino += XFS_INODES_PER_CHUNK) {
		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		/* a freshly allocated chunk must not have a record yet */
		ASSERT(i == 0);

		error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
					     XFS_INODES_PER_CHUNK,
					     XFS_INODES_PER_CHUNK,
					     XFS_INOBT_ALL_FREE, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 1);
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);

	return 0;
}

/*
 * Verify that the number of free inodes in the AGI is correct.
 *
 * DEBUG builds only; otherwise the name is a macro that evaluates to 0.  The
 * walk is performed only when the btree has a single level.
 */
#ifdef DEBUG
static int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_nlevels == 1) {
		xfs_inobt_rec_incore_t rec;
		int		freecount = 0;
		int		error;
		int		i;

		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
		if (error)
			return error;

		/* sum ir_freecount over every record in the tree */
		do {
			error = xfs_inobt_get_rec(cur, &rec, &i);
			if (error)
				return error;

			if (i) {
				freecount += rec.ir_freecount;
				error = xfs_btree_increment(cur, 0, &i);
				if (error)
					return error;
			}
		} while (i == 1);

		if (!xfs_is_shutdown(cur->bc_mp)) {
			ASSERT(freecount ==
				to_perag(cur->bc_group)->pagi_freecount);
		}
	}
	return 0;
}
#else
#define xfs_check_agi_freecount(cur)	0
#endif

/*
 * Initialise a new set of inodes. When called without a transaction context
 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 * than logging them (which in a transaction context puts them into the AIL
 * for writeback rather than the xfsbufd queue).
 *
 * Returns 0, or the error from xfs_trans_get_buf() for the first cluster
 * buffer that cannot be obtained.
 */
int
xfs_ialloc_inode_init(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct list_head	*buffer_list,
	int			icount,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_agblock_t		length,
	unsigned int		gen)
{
	struct xfs_buf		*fbuf;
	struct xfs_dinode	*free;
	int			nbufs;
	int			version;
	int			i, j;
	xfs_daddr_t		d;
	xfs_ino_t		ino = 0;
	int			error;

	/*
	 * Loop over the new block(s), filling in the inodes.  For small block
	 * sizes, manipulate the inodes in buffers which are multiples of the
	 * blocks size.
	 */
	nbufs = length / M_IGEO(mp)->blocks_per_cluster;

	/*
	 * Figure out what version number to use in the inodes we create.  If
	 * the superblock version has caught up to the one that supports the new
	 * inode format, then use the new inode version.  Otherwise use the old
	 * version so that old kernels will continue to be able to use the file
	 * system.
	 *
	 * For v3 inodes, we also need to write the inode number into the inode,
	 * so calculate the first inode number of the chunk here as
	 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
	 * across multiple filesystem blocks (such as a cluster) and so cannot
	 * be used in the cluster buffer loop below.
	 *
	 * Further, because we are writing the inode directly into the buffer
	 * and calculating a CRC on the entire inode, we have to log the entire
	 * inode so that the entire range the CRC covers is present in the log.
	 * That means for v3 inode we log the entire buffer rather than just the
	 * inode cores.
	 */
	if (xfs_has_v3inodes(mp)) {
		version = 3;
		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));

		/*
		 * log the initialisation that is about to take place as a
		 * logical operation. This means the transaction does not
		 * need to log the physical changes to the inode buffers as log
		 * recovery will know what initialisation is actually needed.
		 * Hence we only need to log the buffers as "ordered" buffers so
		 * they track in the AIL as if they were physically logged.
		 */
		if (tp)
			xfs_icreate_log(tp, agno, agbno, icount,
					mp->m_sb.sb_inodesize, length, gen);
	} else
		version = 2;

	for (j = 0; j < nbufs; j++) {
		/*
		 * Get the block.
		 */
		d = XFS_AGB_TO_DADDR(mp, agno, agbno +
				(j * M_IGEO(mp)->blocks_per_cluster));
		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
				mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
				XBF_UNMAPPED, &fbuf);
		if (error)
			return error;

		/* Initialize the inode buffers and log them appropriately. */
		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
		for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
			/* byte offset of inode i within the cluster buffer */
			int	ioffset = i << mp->m_sb.sb_inodelog;

			free = xfs_make_iptr(mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);

			if (version == 3) {
				free->di_ino = cpu_to_be64(ino);
				ino++;
				uuid_copy(&free->di_uuid,
					  &mp->m_sb.sb_meta_uuid);
				xfs_dinode_calc_crc(mp, free);
			} else if (tp) {
				/* just log the inode core */
				xfs_trans_log_buf(tp, fbuf, ioffset,
					  ioffset + XFS_DINODE_SIZE(mp) - 1);
			}
		}

		if (tp) {
			/*
			 * Mark the buffer as an inode allocation buffer so it
			 * sticks in AIL at the point of this allocation
			 * transaction. This ensures that they are on disk before
			 * the tail of the log can be moved past this
			 * transaction (i.e. by preventing relogging from moving
			 * it forward in the log).
			 */
			xfs_trans_inode_alloc_buf(tp, fbuf);
			if (version == 3) {
				/*
				 * Mark the buffer as ordered so that they are
				 * not physically logged in the transaction but
				 * still tracked in the AIL as part of the
				 * transaction and pin the log appropriately.
				 */
				xfs_trans_ordered_buf(tp, fbuf);
			}
		} else {
			/* no transaction: queue for delayed write instead */
			fbuf->b_flags |= XBF_DONE;
			xfs_buf_delwri_queue(fbuf, buffer_list);
			xfs_buf_relse(fbuf);
		}
	}
	return 0;
}

/*
 * Align startino and allocmask for a recently allocated sparse chunk such that
 * they are fit for insertion (or merge) into the on-disk inode btrees.
 *
 * Background:
 *
 * When enabled, sparse inode support increases the inode alignment from cluster
 * size to inode chunk size. This means that the minimum range between two
 * non-adjacent inode records in the inobt is large enough for a full inode
 * record. This allows for cluster sized, cluster aligned block allocation
 * without need to worry about whether the resulting inode record overlaps with
 * another record in the tree. Without this basic rule, we would have to deal
 * with the consequences of overlap by potentially undoing recent allocations in
 * the inode allocation codepath.
 *
 * Because of this alignment rule (which is enforced on mount), there are two
 * inobt possibilities for newly allocated sparse chunks. One is that the
 * aligned inode record for the chunk covers a range of inodes not already
 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 * other is that a record already exists at the aligned startino that considers
 * the newly allocated range as sparse. In the latter case, record content is
 * merged in hope that sparse inode chunks fill to full chunks over time.
 */
STATIC void
xfs_align_sparse_ino(
	struct xfs_mount		*mp,
	xfs_agino_t			*startino,
	uint16_t			*allocmask)
{
	xfs_agblock_t			agbno;
	xfs_agblock_t			mod;
	int				offset;

	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
	mod = agbno % mp->m_sb.sb_inoalignmt;
	/* already aligned: both in/out values are left untouched */
	if (!mod)
		return;

	/* calculate the inode offset and align startino */
	offset = XFS_AGB_TO_AGINO(mp, mod);
	*startino -= offset;

	/*
	 * Since startino has been aligned down, left shift allocmask such that
	 * it continues to represent the same physical inodes relative to the
	 * new startino.
	 */
	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
}

/*
 * Determine whether the source inode record can merge into the target. Both
 * records must be sparse, the inode ranges must match and there must be no
 * allocation overlap between the records.
 */
STATIC bool
__xfs_inobt_can_merge(
	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
	struct xfs_inobt_rec_incore	*srec)	/* src record */
{
	uint64_t			talloc;
	uint64_t			salloc;

	/* records must cover the same inode range */
	if (trec->ir_startino != srec->ir_startino)
		return false;

	/* both records must be sparse */
	if (!xfs_inobt_issparse(trec->ir_holemask) ||
	    !xfs_inobt_issparse(srec->ir_holemask))
		return false;

	/* both records must track some inodes */
	if (!trec->ir_count || !srec->ir_count)
		return false;

	/* can't exceed capacity of a full record */
	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
		return false;

	/* verify there is no allocation overlap */
	talloc = xfs_inobt_irec_to_allocmask(trec);
	salloc = xfs_inobt_irec_to_allocmask(srec);
	if (talloc & salloc)
		return false;

	return true;
}

/*
 * Merge the source inode record into the target. The caller must call
 * __xfs_inobt_can_merge() to ensure the merge is valid.
 *
 * Only @trec is modified.
 */
STATIC void
__xfs_inobt_rec_merge(
	struct xfs_inobt_rec_incore	*trec,	/* target */
	struct xfs_inobt_rec_incore	*srec)	/* src */
{
	ASSERT(trec->ir_startino == srec->ir_startino);

	/* combine the counts */
	trec->ir_count += srec->ir_count;
	trec->ir_freecount += srec->ir_freecount;

	/*
	 * Merge the holemask and free mask. For both fields, 0 bits refer to
	 * allocated inodes. We combine the allocated ranges with bitwise AND.
	 */
	trec->ir_holemask &= srec->ir_holemask;
	trec->ir_free &= srec->ir_free;
}

/*
 * Insert a new sparse inode chunk into the associated inode allocation btree.
 * The inode record for the sparse chunk is pre-aligned to a startino that
 * should match any pre-existing sparse inode record in the tree. This allows
 * sparse chunks to fill over time.
 *
 * If no preexisting record exists, the provided record is inserted.
 * If there is a preexisting record, the provided record is merged with the
 * existing record and updated in place. The merged record is returned in nrec.
 *
 * It is considered corruption if a merge is requested and not possible. Given
 * the sparse inode alignment constraints, this should never happen.
 *
 * Returns 0 on success.  Every failure path deletes the cursor with
 * XFS_BTREE_ERROR; the corruption paths also mark the btree sick and return
 * -EFSCORRUPTED.
 */
STATIC int
xfs_inobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new/merged rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;
	struct xfs_inobt_rec_incore	rec;

	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	/*
	 * A record exists at this startino.  Merge the records.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}
	if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * This should never fail. If we have coexisting records that
	 * cannot merge, something is seriously wrong.
	 */
	if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	trace_xfs_irec_merge_pre(pag, &rec, nrec);

	/* merge to nrec to output the updated record */
	__xfs_inobt_rec_merge(nrec, &rec);

	trace_xfs_irec_merge_post(pag, nrec);

	error = xfs_inobt_rec_check_count(mp, nrec);
	if (error)
		goto error;

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Insert a new sparse inode chunk into the free inode btree. The inode
 * record for the sparse chunk is pre-aligned to a startino that should match
 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 * to fill over time.
 *
 * The new record is always inserted, overwriting a pre-existing record if
 * there is one.
 *
 * Unlike xfs_inobt_insert_sprec(), no merge is attempted here: an existing
 * record is replaced by @nrec through xfs_inobt_update().
 */
STATIC int
xfs_finobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}
	} else {
		error = xfs_inobt_update(cur, nrec);
		if (error)
			goto error;
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Allocate new inodes in the allocation group specified by agbp.
Returns 0 if
 * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
 * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
 * inode count threshold, or the usual negative error code for other errors.
 */
STATIC int
xfs_ialloc_ag_alloc(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agi		*agi;
	struct xfs_alloc_arg	args;
	int			error;
	xfs_agino_t		newino;		/* new first inode's number */
	xfs_agino_t		newlen;		/* new number of inodes */
	int			isaligned = 0;	/* inode allocation at stripe */
						/* unit boundary */
	/* init. to full chunk */
	struct xfs_inobt_rec_incore rec;
	struct xfs_ino_geometry	*igeo = M_IGEO(tp->t_mountp);
	uint16_t		allocmask = (uint16_t) -1;
	int			do_sparse = 0;

	memset(&args, 0, sizeof(args));
	args.tp = tp;
	args.mp = tp->t_mountp;
	/* NULLFSBLOCK in args.fsbno is the "nothing allocated yet" marker below */
	args.fsbno = NULLFSBLOCK;
	args.oinfo = XFS_RMAP_OINFO_INODES;
	args.pag = pag;

#ifdef DEBUG
	/* randomly do sparse inode allocations */
	if (xfs_has_sparseinodes(tp->t_mountp) &&
	    igeo->ialloc_min_blks < igeo->ialloc_blks)
		do_sparse = get_random_u32_below(2);
#endif

	/*
	 * Locking will ensure that we don't have two callers in here
	 * at one time.
	 */
	newlen = igeo->ialloc_inos;
	if (igeo->maxicount &&
	    percpu_counter_read_positive(&args.mp->m_icount) + newlen >
							igeo->maxicount)
		return -ENOSPC;
	args.minlen = args.maxlen = igeo->ialloc_blks;
	/*
	 * First try to allocate inodes contiguous with the last-allocated
	 * chunk of inodes.  If the filesystem is striped, this will fill
	 * an entire stripe unit with inodes.
	 */
	agi = agbp->b_addr;
	newino = be32_to_cpu(agi->agi_newino);
	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
		     igeo->ialloc_blks;
	/* DEBUG builds may jump straight to the sparse allocation attempt */
	if (do_sparse)
		goto sparse_alloc;
	if (likely(newino != NULLAGINO &&
		  (args.agbno < be32_to_cpu(agi->agi_length)))) {
		args.prod = 1;

		/*
		 * We need to take into account alignment here to ensure that
		 * we don't modify the free list if we fail to have an exact
		 * block. If we don't have an exact match, and every other
		 * allocation attempt fails, we'll end up cancelling
		 * a dirty transaction and shutting down.
		 *
		 * For an exact allocation, alignment must be 1,
		 * however we need to take cluster alignment into account when
		 * fixing up the freelist. Use the minalignslop field to
		 * indicate that extra blocks might be required for alignment,
		 * but not to use them in the actual exact allocation.
		 */
		args.alignment = 1;
		args.minalignslop = igeo->cluster_align - 1;

		/* Allow space for the inode btree to split. */
		args.minleft = igeo->inobt_maxlevels;
		error = xfs_alloc_vextent_exact_bno(&args,
				xfs_agbno_to_fsb(pag, args.agbno));
		if (error)
			return error;

		/*
		 * This request might have dirtied the transaction if the AG can
		 * satisfy the request, but the exact block was not available.
		 * If the allocation did fail, subsequent requests will relax
		 * the exact agbno requirement and increase the alignment
		 * instead. It is critical that the total size of the request
		 * (len + alignment + slop) does not increase from this point
		 * on, so reset minalignslop to ensure it is not included in
		 * subsequent requests.
		 */
		args.minalignslop = 0;
	}

	if (unlikely(args.fsbno == NULLFSBLOCK)) {
		/*
		 * Set the alignment for the allocation.
		 * If stripe alignment is turned on then align at stripe unit
		 * boundary.
		 * If the cluster size is smaller than a filesystem block
		 * then we're doing I/O for inodes in filesystem block size
		 * pieces, so don't need alignment anyway.
		 */
		isaligned = 0;
		if (igeo->ialloc_align) {
			ASSERT(!xfs_has_noalign(args.mp));
			args.alignment = args.mp->m_dalign;
			isaligned = 1;
		} else
			args.alignment = igeo->cluster_align;
		/*
		 * Allocate a fixed-size extent of inodes.
		 */
		args.prod = 1;
		/*
		 * Allow space for the inode btree to split.
		 */
		args.minleft = igeo->inobt_maxlevels;
		error = xfs_alloc_vextent_near_bno(&args,
				xfs_agbno_to_fsb(pag,
					be32_to_cpu(agi->agi_root)));
		if (error)
			return error;
	}

	/*
	 * If stripe alignment is turned on, then try again with cluster
	 * alignment.
	 */
	if (isaligned && args.fsbno == NULLFSBLOCK) {
		args.alignment = igeo->cluster_align;
		error = xfs_alloc_vextent_near_bno(&args,
				xfs_agbno_to_fsb(pag,
					be32_to_cpu(agi->agi_root)));
		if (error)
			return error;
	}

	/*
	 * Finally, try a sparse allocation if the filesystem supports it and
	 * the sparse allocation length is smaller than a full chunk.
	 */
	if (xfs_has_sparseinodes(args.mp) &&
	    igeo->ialloc_min_blks < igeo->ialloc_blks &&
	    args.fsbno == NULLFSBLOCK) {
sparse_alloc:
		args.alignment = args.mp->m_sb.sb_spino_align;
		args.prod = 1;

		args.minlen = igeo->ialloc_min_blks;
		args.maxlen = args.minlen;

		/*
		 * The inode record will be aligned to full chunk size. We must
		 * prevent sparse allocation from AG boundaries that result in
		 * invalid inode records, such as records that start at agbno 0
		 * or extend beyond the AG.
		 *
		 * Set min agbno to the first aligned, non-zero agbno and max to
		 * the last aligned agbno that is at least one full chunk from
		 * the end of the AG.
		 */
		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
		args.max_agbno = round_down(xfs_ag_block_count(args.mp,
							pag_agno(pag)),
					    args.mp->m_sb.sb_inoalignmt) -
				 igeo->ialloc_blks;

		error = xfs_alloc_vextent_near_bno(&args,
				xfs_agbno_to_fsb(pag,
					be32_to_cpu(agi->agi_root)));
		if (error)
			return error;

		newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
		ASSERT(newlen <= XFS_INODES_PER_CHUNK);
		/*
		 * Set one low-order mask bit per XFS_INODES_PER_HOLEMASK_BIT
		 * inodes that were just allocated; the remaining bits stay
		 * clear and become holes once the mask is inverted below.
		 */
		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
	}

	/* every attempt above left fsbno unset: let the caller try another AG */
	if (args.fsbno == NULLFSBLOCK)
		return -EAGAIN;

	ASSERT(args.len == args.minlen);

	/*
	 * Stamp and write the inode buffers.
	 *
	 * Seed the new inode cluster with a random generation number. This
	 * prevents short-term reuse of generation numbers if a chunk is
	 * freed and then immediately reallocated. We use random numbers
	 * rather than a linear progression to prevent the next generation
	 * number from being easily guessable.
	 */
	error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag),
			args.agbno, args.len, get_random_u32());

	if (error)
		return error;
	/*
	 * Convert the results.
	 */
	newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);

	if (xfs_inobt_issparse(~allocmask)) {
		/*
		 * We've allocated a sparse chunk. Align the startino and mask.
		 */
		xfs_align_sparse_ino(args.mp, &newino, &allocmask);

		rec.ir_startino = newino;
		rec.ir_holemask = ~allocmask;
		rec.ir_count = newlen;
		rec.ir_freecount = newlen;
		rec.ir_free = XFS_INOBT_ALL_FREE;

		/*
		 * Insert the sparse record into the inobt and allow for a merge
		 * if necessary. If a merge does occur, rec is updated to the
		 * merged record.
		 */
		error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
		if (error == -EFSCORRUPTED) {
			xfs_alert(args.mp,
	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
				  xfs_agino_to_ino(pag, rec.ir_startino),
				  rec.ir_holemask, rec.ir_count);
			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
		}
		if (error)
			return error;

		/*
		 * We can't merge the part we've just allocated as for the inobt
		 * due to finobt semantics. The original record may or may not
		 * exist independent of whether physical inodes exist in this
		 * sparse chunk.
		 *
		 * We must update the finobt record based on the inobt record.
		 * rec contains the fully merged and up to date inobt record
		 * from the previous call. xfs_finobt_insert_sprec() does not
		 * merge: it replaces any existing record with this one.
		 */
		if (xfs_has_finobt(args.mp)) {
			error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
			if (error)
				return error;
		}
	} else {
		/* full chunk - insert new records to both btrees */
		error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
		if (error)
			return error;

		if (xfs_has_finobt(args.mp)) {
			error = xfs_inobt_insert(pag, tp, agbp, newino,
						 newlen, true);
			if (error)
				return error;
		}
	}

	/*
	 * Update AGI counts and newino.
	 */
	be32_add_cpu(&agi->agi_count, newlen);
	be32_add_cpu(&agi->agi_freecount, newlen);
	pag->pagi_freecount += newlen;
	pag->pagi_count += newlen;
	agi->agi_newino = cpu_to_be32(newino);

	/*
	 * Log allocation group header fields
	 */
	xfs_ialloc_log_agi(tp, agbp,
		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
	/*
	 * Modify/log superblock values for inode count and inode free count.
	 */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
	return 0;
}

/*
 * Try to retrieve the next record to the left/right from the current one.
 *
 * A non-zero @left steps the cursor backwards, zero steps it forwards.
 * *done is set to non-zero when the step found no record; @rec is only
 * filled in when *done is zero. Returns 0 or a negative error code.
 */
STATIC int
xfs_ialloc_next_rec(
	struct xfs_btree_cur	*cur,
	xfs_inobt_rec_incore_t	*rec,
	int			*done,
	int			left)
{
	int                     error;
	int			i;

	if (left)
		error = xfs_btree_decrement(cur, 0, &i);
	else
		error = xfs_btree_increment(cur, 0, &i);

	if (error)
		return error;
	*done = !i;
	if (i) {
		error = xfs_inobt_get_rec(cur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}
	}

	return 0;
}

/*
 * Position the cursor on the record that starts exactly at @agino and read it.
 *
 * *done is set to non-zero when no such record exists; @rec is only filled in
 * when *done is zero. Returns 0 or a negative error code.
 */
STATIC int
xfs_ialloc_get_rec(
	struct xfs_btree_cur	*cur,
	xfs_agino_t		agino,
	xfs_inobt_rec_incore_t	*rec,
	int			*done)
{
	int                     error;
	int			i;

	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	*done = !i;
	if (i) {
		error = xfs_inobt_get_rec(cur, rec, &i);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}
	}

	return 0;
}

/*
 * Return the offset of the first free inode in the record. If the inode chunk
 * is sparsely allocated, we convert the record holemask to inode granularity
 * and mask off the unallocated regions from the inode free mask.
*/
STATIC int
xfs_inobt_first_free_inode(
	struct xfs_inobt_rec_incore	*rec)
{
	xfs_inofree_t			realfree;

	/* if there are no holes, return the first available offset */
	if (!xfs_inobt_issparse(rec->ir_holemask))
		return xfs_lowbit64(rec->ir_free);

	/* only bits that are both allocated and free are real candidates */
	realfree = xfs_inobt_irec_to_allocmask(rec);
	realfree &= rec->ir_free;

	return xfs_lowbit64(realfree);
}

/*
 * If this AG has corrupt inodes, check if allocating this inode would fail
 * with corruption errors.  Returns 0 if we're clear, or EAGAIN to try again
 * somewhere else.
 *
 * Any failure to map the inode or to read its buffer is reported as -EAGAIN;
 * the buffer is released again before returning on success.
 */
static int
xfs_dialloc_check_ino(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino)
{
	struct xfs_imap		imap;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_imap(pag, tp, ino, &imap, 0);
	if (error)
		return -EAGAIN;

	error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp);
	if (error)
		return -EAGAIN;

	xfs_trans_brelse(tp, bp);
	return 0;
}

/*
 * Allocate an inode using the inobt-only algorithm.
 *
 * The record to allocate from is chosen in this order: the parent's own chunk
 * and then its left/right neighbours when @pag is the parent's AG, the chunk
 * named by agi_newino, and finally the first record in the AG that has a free
 * inode. On success the inobt record, the AGI free count, the perag free
 * count and the superblock free count are all decremented, the new inode
 * number is stored in *inop and 0 is returned. A negative error code is
 * returned otherwise.
 */
STATIC int
xfs_dialloc_ag_inobt(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agi		*agi = agbp->b_addr;
	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur	*cur, *tcur;
	struct xfs_inobt_rec_incore rec, trec;
	xfs_ino_t		ino;
	int			error;
	int			offset;
	int			i, j;
	/* budget of left/right search steps; not reset on restart_pagno */
	int			searchdistance = 10;

	ASSERT(xfs_perag_initialised_agi(pag));
	ASSERT(xfs_perag_allows_inodes(pag));
	ASSERT(pag->pagi_freecount > 0);

 restart_pagno:
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/*
	 * If in the same AG as the parent, try to get near the parent.
	 */
	if (pagno == pag_agno(pag)) {
		int		doneleft;	/* done, to the left */
		int		doneright;	/* done, to the right */

		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		error = xfs_inobt_get_rec(cur, &rec, &j);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, j != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}

		if (rec.ir_freecount > 0) {
			/*
			 * Found a free inode in the same chunk
			 * as the parent, done.
			 */
			goto alloc_inode;
		}


		/*
		 * In the same AG as parent, but parent's chunk is full.
		 */

		/* duplicate the cursor, search left & right simultaneously */
		error = xfs_btree_dup_cursor(cur, &tcur);
		if (error)
			goto error0;

		/*
		 * Skip to last blocks looked up if same parent inode.
		 */
		if (pagino != NULLAGINO &&
		    pag->pagl_pagino == pagino &&
		    pag->pagl_leftrec != NULLAGINO &&
		    pag->pagl_rightrec != NULLAGINO) {
			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
						   &trec, &doneleft);
			if (error)
				goto error1;

			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
						   &rec, &doneright);
			if (error)
				goto error1;
		} else {
			/* search left with tcur, back up 1 record */
			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
			if (error)
				goto error1;

			/* search right with cur, go forward 1 record. */
			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
			if (error)
				goto error1;
		}

		/*
		 * Loop until we find an inode chunk with a free inode.
		 */
		while (--searchdistance > 0 && (!doneleft || !doneright)) {
			int	useleft;  /* using left inode chunk this time */

			/* figure out the closer block if both are valid. */
			if (!doneleft && !doneright) {
				/*
				 * Left distance is measured from the last
				 * inode of the left chunk, right distance to
				 * the first inode of the right chunk.
				 */
				useleft = pagino -
				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
				  rec.ir_startino - pagino;
			} else {
				useleft = !doneleft;
			}

			/* free inodes to the left? */
			if (useleft && trec.ir_freecount) {
				/* tcur becomes the cursor used for the update */
				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
				cur = tcur;

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				rec = trec;
				goto alloc_inode;
			}

			/* free inodes to the right? */
			if (!useleft && rec.ir_freecount) {
				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

				pag->pagl_leftrec = trec.ir_startino;
				pag->pagl_rightrec = rec.ir_startino;
				pag->pagl_pagino = pagino;
				goto alloc_inode;
			}

			/* get next record to check */
			if (useleft) {
				error = xfs_ialloc_next_rec(tcur, &trec,
								 &doneleft, 1);
			} else {
				error = xfs_ialloc_next_rec(cur, &rec,
								 &doneright, 0);
			}
			if (error)
				goto error1;
		}

		if (searchdistance <= 0) {
			/*
			 * Not in range - save last search
			 * location and allocate a new inode
			 */
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			pag->pagl_leftrec = trec.ir_startino;
			pag->pagl_rightrec = rec.ir_startino;
			pag->pagl_pagino = pagino;

		} else {
			/*
			 * We've reached the end of the btree. Because
			 * we are only searching a small chunk of the
			 * btree each search, there are obviously free
			 * inodes closer to the parent inode than we
			 * are now. Restart the search again.
			 */
			pag->pagl_pagino = NULLAGINO;
			pag->pagl_leftrec = NULLAGINO;
			pag->pagl_rightrec = NULLAGINO;
			xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
			goto restart_pagno;
		}
	}

	/*
	 * In a different AG from the parent.
	 * See if the most recently allocated block has any free.
	 */
	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
					 XFS_LOOKUP_EQ, &i);
		if (error)
			goto error0;

		if (i == 1) {
			error = xfs_inobt_get_rec(cur, &rec, &j);
			if (error)
				goto error0;

			if (j == 1 && rec.ir_freecount > 0) {
				/*
				 * The last chunk allocated in the group
				 * still has a free inode.
				 */
				goto alloc_inode;
			}
		}
	}

	/*
	 * None left in the last group, search the whole AG
	 */
	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}

	/* walk forward record by record until one has a free inode */
	for (;;) {
		error = xfs_inobt_get_rec(cur, &rec, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		if (rec.ir_freecount > 0)
			break;
		error = xfs_btree_increment(cur, 0, &i);
		if (error)
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
	}

alloc_inode:
	/* rec and cur now refer to the chunk the inode is taken from */
	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error0;
	}

	/* mark the inode allocated in the record and write the record back */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	error = xfs_inobt_update(cur, &rec);
	if (error)
		goto error0;
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
	*inop = ino;
	return 0;
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Use the free inode btree to allocate an inode based on distance from the
 * parent. Note that the provided cursor may be deleted and replaced.
*/ STATIC int xfs_dialloc_ag_finobt_near( xfs_agino_t pagino, struct xfs_btree_cur **ocur, struct xfs_inobt_rec_incore *rec) { struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ struct xfs_btree_cur *rcur; /* right search cursor */ struct xfs_inobt_rec_incore rrec; int error; int i, j; error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); if (error) return error; if (i == 1) { error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) { xfs_btree_mark_sick(lcur); return -EFSCORRUPTED; } /* * See if we've landed in the parent inode record. The finobt * only tracks chunks with at least one free inode, so record * existence is enough. */ if (pagino >= rec->ir_startino && pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) return 0; } error = xfs_btree_dup_cursor(lcur, &rcur); if (error) return error; error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); if (error) goto error_rcur; if (j == 1) { error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } if (i == 1 && j == 1) { /* * Both the left and right records are valid. Choose the closer * inode chunk to the target. 
*/ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > (rrec.ir_startino - pagino)) { *rec = rrec; xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); *ocur = rcur; } else { xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); } } else if (j == 1) { /* only the right record is valid */ *rec = rrec; xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); *ocur = rcur; } else if (i == 1) { /* only the left record is valid */ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); } return 0; error_rcur: xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); return error; } /* * Use the free inode btree to find a free inode based on a newino hint. If * the hint is NULL, find the first free inode in the AG. */ STATIC int xfs_dialloc_ag_finobt_newino( struct xfs_agi *agi, struct xfs_btree_cur *cur, struct xfs_inobt_rec_incore *rec) { int error; int i; if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) return error; if (i == 1) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } } /* * Find the first inode available in the AG. */ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } /* * Update the inobt based on a modification made to the finobt. Also ensure that * the records from both trees are equivalent post-modification. 
*/
STATIC int
xfs_dialloc_ag_update_inobt(
	struct xfs_btree_cur		*cur,	/* inobt cursor */
	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
	int				offset) /* inode offset */
{
	struct xfs_inobt_rec_incore	rec;
	int				error;
	int				i;

	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);

	/* apply the same allocation to the inobt copy of the record */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;

	/* after the update both trees must agree on free mask and count */
	if (XFS_IS_CORRUPT(cur->bc_mp,
			   rec.ir_free != frec->ir_free ||
			   rec.ir_freecount != frec->ir_freecount)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}

	return xfs_inobt_update(cur, &rec);
}

/*
 * Allocate an inode using the free inode btree, if available. Otherwise, fall
 * back to the inobt search algorithm.
 *
 * The caller selected an AG for us, and made sure that free inodes are
 * available.
 *
 * On success the new inode number is stored in *inop and 0 is returned;
 * otherwise a negative error code is returned.
 */
static int
xfs_dialloc_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_ino_t		parent,
	xfs_ino_t		*inop)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_agi			*agi = agbp->b_addr;
	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
	struct xfs_btree_cur		*cur;	/* finobt cursor */
	struct xfs_btree_cur		*icur;	/* inobt cursor */
	struct xfs_inobt_rec_incore	rec;
	xfs_ino_t			ino;
	int				error;
	int				offset;
	int				i;

	if (!xfs_has_finobt(mp))
		return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);

	/*
	 * If pagino is 0 (this is the root inode allocation) use newino.
	 * This must work because we've just allocated some.
	 */
	if (!pagino)
		pagino = be32_to_cpu(agi->agi_newino);

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_cur;

	/*
	 * The search algorithm depends on whether we're in the same AG as the
	 * parent. If so, find the closest available inode to the parent. If
	 * not, consider the agi hint or find the first free inode in the AG.
	 */
	if (pag_agno(pag) == pagno)
		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
	else
		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
	if (error)
		goto error_cur;

	offset = xfs_inobt_first_free_inode(&rec);
	ASSERT(offset >= 0);
	ASSERT(offset < XFS_INODES_PER_CHUNK);
	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
				   XFS_INODES_PER_CHUNK) == 0);
	ino = xfs_agino_to_ino(pag, rec.ir_startino + offset);

	if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) {
		error = xfs_dialloc_check_ino(pag, tp, ino);
		if (error)
			goto error_cur;
	}

	/*
	 * Modify or remove the finobt record.
	 */
	rec.ir_free &= ~XFS_INOBT_MASK(offset);
	rec.ir_freecount--;
	/* a record with no free inodes left is removed from the finobt */
	if (rec.ir_freecount)
		error = xfs_inobt_update(cur, &rec);
	else
		error = xfs_btree_delete(cur, &i);
	if (error)
		goto error_cur;

	/*
	 * The finobt has now been updated appropriately. We haven't updated the
	 * agi and superblock yet, so we can create an inobt cursor and validate
	 * the original freecount. If all is well, make the equivalent update to
	 * the inobt using the finobt record and offset information.
	 */
	icur = xfs_inobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;

	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
	if (error)
		goto error_icur;

	/*
	 * Both trees have now been updated. We must update the perag and
	 * superblock before we can check the freecount for each btree.
	 */
	be32_add_cpu(&agi->agi_freecount, -1);
	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
	pag->pagi_freecount--;

	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);

	error = xfs_check_agi_freecount(icur);
	if (error)
		goto error_icur;
	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error_icur;

	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	*inop = ino;
	return 0;

error_icur:
	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
error_cur:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Roll the transaction in *tpp while keeping the AGI buffer held.
 *
 * *tpp is updated to whatever transaction xfs_trans_roll() leaves behind and
 * the AGI buffer is joined to it. The return value is the result of
 * xfs_trans_roll().
 */
static int
xfs_dialloc_roll(
	struct xfs_trans	**tpp,
	struct xfs_buf		*agibp)
{
	struct xfs_trans	*tp = *tpp;
	struct xfs_dquot_acct	*dqinfo;
	int			error;

	/*
	 * Hold on to the agibp across the commit so no other allocation can
	 * come in and take the free inodes we just allocated for our caller.
	 */
	xfs_trans_bhold(tp, agibp);

	/*
	 * We want the quota changes to be associated with the next transaction,
	 * NOT this one. So, detach the dqinfo from this and attach it to the
	 * next transaction.
	 */
	dqinfo = tp->t_dqinfo;
	tp->t_dqinfo = NULL;

	error = xfs_trans_roll(&tp);

	/* Re-attach the quota info that we detached from prev trx. */
	tp->t_dqinfo = dqinfo;

	/*
	 * Join the buffer even on commit error so that the buffer is released
	 * when the caller cancels the transaction and doesn't have to handle
	 * this error case specially.
	 */
	xfs_trans_bjoin(tp, agibp);
	*tpp = tp;
	return error;
}

/*
 * Decide whether @pag is worth trying for an inode allocation.
 *
 * Returns true when the AG already has free inodes, or when @ok_alloc is set
 * and the AG's free space counters pass the check below. Returns false for a
 * NULL @pag, an AG that does not allow inodes, or when reading the AGI or AGF
 * fails. A non-zero @flags is passed to the AGF read and also adds alignment
 * slack to the space requirement.
 */
static bool
xfs_dialloc_good_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	umode_t			mode,
	int			flags,
	bool			ok_alloc)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_extlen_t		ineed;
	xfs_extlen_t		longest = 0;
	int			needspace;
	int			error;

	if (!pag)
		return false;
	if (!xfs_perag_allows_inodes(pag))
		return false;

	if (!xfs_perag_initialised_agi(pag)) {
		error = xfs_ialloc_read_agi(pag, tp, 0, NULL);
		if (error)
			return false;
	}

	if (pag->pagi_freecount)
		return true;
	if (!ok_alloc)
		return false;

	if (!xfs_perag_initialised_agf(pag)) {
		error = xfs_alloc_read_agf(pag, tp, flags, NULL);
		if (error)
			return false;
	}

	/*
	 * Check that there is enough free space for the file plus a chunk of
	 * inodes if we need to allocate some. If this is the first pass across
	 * the AGs, take into account the potential space needed for alignment
	 * of inode chunks when checking the longest contiguous free space in
	 * the AG - this prevents us from getting ENOSPC because we have free
	 * space larger than ialloc_blks but alignment constraints prevent us
	 * from using it.
	 *
	 * If we can't find an AG with space for full alignment slack to be
	 * taken into account, we must be near ENOSPC in all AGs.  Hence we
	 * don't include alignment for the second pass and so if we fail
	 * allocation due to alignment issues then it is most likely a real
	 * ENOSPC condition.
	 *
	 * XXX(dgc): this calculation is now bogus thanks to the per-ag
	 * reservations that xfs_alloc_fix_freelist() now does via
	 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
	 * be more than large enough for the check below to succeed, but
	 * xfs_alloc_space_available() will fail because of the non-zero
	 * metadata reservation and hence we won't actually be able to allocate
	 * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
	 * because of this.
	 */
	ineed = M_IGEO(mp)->ialloc_min_blks;
	if (flags && ineed > 1)
		ineed += M_IGEO(mp)->cluster_align;
	longest = pag->pagf_longest;
	/* no longest extent recorded: count one block if the AGFL has any */
	if (!longest)
		longest = pag->pagf_flcount > 0;
	/* these file types get one extra block factored in */
	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);

	if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
		return false;
	return true;
}

/*
 * Try to allocate an inode from @pag.
 *
 * If the AG has no free inodes, a new chunk is allocated first (only when
 * @ok_alloc is set) and the transaction is rolled before the inode itself is
 * allocated. On success the inode number is stored in *new_ino and 0 is
 * returned. -EAGAIN is returned when the AG has no free inodes and @ok_alloc
 * is false; other errors from the helpers are passed through.
 */
static int
xfs_dialloc_try_ag(
	struct xfs_perag	*pag,
	struct xfs_trans	**tpp,
	xfs_ino_t		parent,
	xfs_ino_t		*new_ino,
	bool			ok_alloc)
{
	struct xfs_buf		*agbp;
	xfs_ino_t		ino;
	int			error;

	/*
	 * Then read in the AGI buffer and recheck with the AGI buffer
	 * lock held.
	 */
	error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp);
	if (error)
		return error;

	if (!pag->pagi_freecount) {
		if (!ok_alloc) {
			error = -EAGAIN;
			goto out_release;
		}

		error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
		if (error < 0)
			goto out_release;

		/*
		 * We successfully allocated space for an inode cluster in this
		 * AG.  Roll the transaction so that we can allocate one of the
		 * new inodes.
		 */
		ASSERT(pag->pagi_freecount > 0);
		error = xfs_dialloc_roll(tpp, agbp);
		if (error)
			goto out_release;
	}

	/* Allocate an inode in the found AG */
	error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
	if (!error)
		*new_ino = ino;
	return error;

out_release:
	xfs_trans_brelse(*tpp, agbp);
	return error;
}

/*
 * Pick an AG for the new inode.
 *
 * Directories, symlinks, and regular files frequently allocate at least one
 * block, so factor that potential expansion when we examine whether an AG has
 * enough space for file creation.  Try to keep metadata files all in the same
 * AG.
*/
static inline xfs_agnumber_t
xfs_dialloc_pick_ag(
	struct xfs_mount	*mp,
	struct xfs_inode	*dp,
	umode_t			mode)
{
	xfs_agnumber_t		start_agno;

	/* no parent directory: start at AG 0 */
	if (!dp)
		return 0;
	if (xfs_is_metadir_inode(dp)) {
		/* use the AG that holds sb_logstart when it is set */
		if (mp->m_sb.sb_logstart)
			return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
		return 0;
	}

	/* new directories rotate through the AGs via m_agirotor */
	if (S_ISDIR(mode))
		return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi;

	/* everything else starts in the parent's AG, if that AG is usable */
	start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino);
	if (start_agno >= mp->m_maxagi)
		start_agno = 0;

	return start_agno;
}

/*
 * Allocate an on-disk inode.
 *
 * Mode is used to tell whether the new inode is a directory and hence where to
 * locate it. The on-disk inode that is allocated will be returned in @new_ino
 * on success, otherwise an error will be set to indicate the failure (e.g.
 * -ENOSPC).
 *
 * The AGs are scanned at most twice: first with XFS_ALLOC_FLAG_TRYLOCK, then
 * with no flags. *tpp may be replaced, because allocating a new inode chunk
 * rolls the transaction.
 */
int
xfs_dialloc(
	struct xfs_trans	**tpp,
	const struct xfs_icreate_args *args,
	xfs_ino_t		*new_ino)
{
	struct xfs_mount	*mp = (*tpp)->t_mountp;
	struct xfs_perag	*pag;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_ino_t		ino = NULLFSINO;
	xfs_ino_t		parent = args->pip ? args->pip->i_ino : 0;
	xfs_agnumber_t		agno;
	xfs_agnumber_t		start_agno;
	umode_t			mode = args->mode & S_IFMT;
	bool			ok_alloc = true;
	bool			low_space = false;
	int			flags;
	int			error = 0;

	start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode);

	/*
	 * If we have already hit the ceiling of inode blocks then clear
	 * ok_alloc so we scan all available agi structures for a free
	 * inode.
	 *
	 * Read rough value of mp->m_icount by percpu_counter_read_positive,
	 * which will sacrifice the preciseness but improve the performance.
	 */
	if (igeo->maxicount &&
	    percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
							> igeo->maxicount) {
		ok_alloc = false;
	}

	/*
	 * If we are near to ENOSPC, we want to prefer allocation from AGs that
	 * have free inodes in them rather than use up free space allocating new
	 * inode chunks. Hence we turn off allocation for the first non-blocking
	 * pass through the AGs if we are near ENOSPC to consume free inodes
	 * that we can immediately allocate, but then we allow allocation on the
	 * second pass if we fail to find an AG with free inodes in it.
	 */
	if (percpu_counter_read_positive(&mp->m_fdblocks) <
			mp->m_low_space[XFS_LOWSP_1_PCNT]) {
		ok_alloc = false;
		low_space = true;
	}

	/*
	 * Loop until we find an allocation group that either has free inodes
	 * or in which we can allocate some inodes.  Iterate through the
	 * allocation groups upward, wrapping at the end.
	 */
	flags = XFS_ALLOC_FLAG_TRYLOCK;
retry:
	for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
		if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
			error = xfs_dialloc_try_ag(pag, tpp, parent,
					&ino, ok_alloc);
			/* only -EAGAIN moves on to the next AG */
			if (error != -EAGAIN)
				break;
			error = 0;
		}

		if (xfs_is_shutdown(mp)) {
			error = -EFSCORRUPTED;
			break;
		}
	}
	/* pag is still set if the loop was left early; release it */
	if (pag)
		xfs_perag_rele(pag);
	if (error)
		return error;
	if (ino == NULLFSINO) {
		/* first pass found nothing: retry once without the trylock flag */
		if (flags) {
			flags = 0;
			if (low_space)
				ok_alloc = true;
			goto retry;
		}
		return -ENOSPC;
	}

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if (ino == parent || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	*new_ino = ino;
	return 0;
}

/*
 * Free the blocks of an inode chunk. We must consider that the inode chunk
 * might be sparse and only free the regions that are allocated as part of the
 * chunk.
 */
static int
xfs_difree_inode_chunk(
	struct xfs_trans		*tp,
	struct xfs_perag		*pag,
	struct xfs_inobt_rec_incore	*rec)
{
	struct xfs_mount		*mp = tp->t_mountp;
	xfs_agblock_t			sagbno = XFS_AGINO_TO_AGBNO(mp,
							rec->ir_startino);
	int				startidx, endidx;
	int				nextbit;
	xfs_agblock_t			agbno;
	int				contigblk;
	DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);

	if (!xfs_inobt_issparse(rec->ir_holemask)) {
		/* not sparse, calculate extent info directly */
		return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno),
				M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
				XFS_AG_RESV_NONE, 0);
	}

	/* holemask is only 16-bits (fits in an unsigned long) */
	ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
	holemask[0] = rec->ir_holemask;

	/*
	 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
	 * holemask and convert the start/end index of each range to an extent.
	 * We start with the start and end index both pointing at the first 0 in
	 * the mask.
	 */
	startidx = endidx = find_first_zero_bit(holemask,
						XFS_INOBT_HOLEMASK_BITS);
	nextbit = startidx + 1;
	/*
	 * The loop ends once startidx has been set to XFS_INOBT_HOLEMASK_BITS,
	 * which happens after a search finds no further zero bit.
	 */
	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
		int error;

		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
					     nextbit);
		/*
		 * If the next zero bit is contiguous, update the end index of
		 * the current range and continue.
		 */
		if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
		    nextbit == endidx + 1) {
			endidx = nextbit;
			goto next;
		}

		/*
		 * nextbit is not contiguous with the current end index. Convert
		 * the current start/end to an extent and add it to the free
		 * list.
		 */
		agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
				  mp->m_sb.sb_inopblock;
		contigblk = ((endidx - startidx + 1) *
			     XFS_INODES_PER_HOLEMASK_BIT) /
			    mp->m_sb.sb_inopblock;

		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
		error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno),
				contigblk, &XFS_RMAP_OINFO_INODES,
				XFS_AG_RESV_NONE, 0);
		if (error)
			return error;

		/* reset range to current bit and carry on... */
		startidx = endidx = nextbit;

next:
		nextbit++;
	}
	return 0;
}

/*
 * Mark inode @agino free in the inode btree of @pag.
 *
 * The record covering @agino is looked up, the inode's bit is set in ir_free
 * and ir_freecount is incremented.  If that leaves the whole chunk free and
 * the chunk may be removed, the record is deleted, the AGI/superblock counters
 * are reduced and the chunk's blocks are queued for freeing; @xic then
 * describes the removed cluster.  Otherwise the record is updated in place and
 * the free counters are bumped by one.
 *
 * On success the final state of the record is copied to @orec and 0 is
 * returned; on failure the cursor is torn down with XFS_BTREE_ERROR and the
 * error is returned.
 */
STATIC int
xfs_difree_inobt(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agino_t			agino,
	struct xfs_icluster		*xic,
	struct xfs_inobt_rec_incore	*orec)
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_agi			*agi = agbp->b_addr;
	struct xfs_btree_cur		*cur;
	struct xfs_inobt_rec_incore	rec;
	int				ilen;
	int				error;
	int				i;
	int				off;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));

	/*
	 * Initialize the cursor.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/*
	 * Look for the entry describing this inode.
	 */
	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
			__func__, error);
		goto error0;
	}
	/* A missing record for an inode being freed is treated as corruption. */
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error) {
		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
			__func__, error);
		goto error0;
	}
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Get the offset in the inode chunk.
	 */
	off = agino - rec.ir_startino;
	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
	/*
	 * Mark the inode free & increment the count.
	 */
	rec.ir_free |= XFS_INOBT_MASK(off);
	rec.ir_freecount++;

	/*
	 * When an inode chunk is free, it becomes eligible for removal. Don't
	 * remove the chunk if the block size is large enough for multiple inode
	 * chunks (that might not be free).
	 */
	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
		xic->deleted = true;
		xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
		xic->alloc = xfs_inobt_irec_to_allocmask(&rec);

		/*
		 * Remove the inode cluster from the AGI B+Tree, adjust the
		 * AGI and Superblock inode counts, and mark the disk space
		 * to be freed when the transaction is committed.
		 *
		 * The free counters drop by ilen - 1 rather than ilen: ilen
		 * already includes the inode freed above, which has not been
		 * added to the AGI/superblock free counts on this path.
		 */
		ilen = rec.ir_freecount;
		be32_add_cpu(&agi->agi_count, -ilen);
		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
		pag->pagi_freecount -= ilen - 1;
		pag->pagi_count -= ilen;
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));

		if ((error = xfs_btree_delete(cur, &i))) {
			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
				__func__, error);
			goto error0;
		}

		error = xfs_difree_inode_chunk(tp, pag, &rec);
		if (error)
			goto error0;
	} else {
		xic->deleted = false;

		error = xfs_inobt_update(cur, &rec);
		if (error) {
			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
				__func__, error);
			goto error0;
		}

		/*
		 * Change the inode free counts and log the ag/sb changes.
		 */
		be32_add_cpu(&agi->agi_freecount, 1);
		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
		pag->pagi_freecount++;
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
	}

	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error0;

	/* Hand the updated record back to the caller. */
	*orec = rec;
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;

error0:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Free an inode in the free inode btree.
 */
STATIC int
xfs_difree_finobt(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agino_t			agino,
	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	struct xfs_inobt_rec_incore	rec;
	/* Index of @agino within the chunk described by @ibtrec. */
	int				offset = agino - ibtrec->ir_startino;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	if (i == 0) {
		/*
		 * If the record does not exist in the finobt, we must have just
		 * freed an inode in a previously fully allocated chunk. If not,
		 * something is out of sync.
		 */
		if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
					     ibtrec->ir_count,
					     ibtrec->ir_freecount,
					     ibtrec->ir_free, &i);
		if (error)
			goto error;
		ASSERT(i == 1);

		goto out;
	}

	/*
	 * Read and update the existing record. We could just copy the ibtrec
	 * across here, but that would defeat the purpose of having redundant
	 * metadata. By making the modifications independently, we can catch
	 * corruptions that we wouldn't see if we just copied from one record
	 * to another.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	rec.ir_free |= XFS_INOBT_MASK(offset);
	rec.ir_freecount++;

	/* The independently updated record must now match the inobt record. */
	if (XFS_IS_CORRUPT(mp,
			   rec.ir_free != ibtrec->ir_free ||
			   rec.ir_freecount != ibtrec->ir_freecount)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * The content of inobt records should always match between the inobt
	 * and finobt. The lifecycle of records in the finobt is different from
	 * the inobt in that the finobt only tracks records with at least one
	 * free inode. Hence, if all of the inodes are free and we aren't
	 * keeping inode chunks permanently on disk, remove the record.
	 * Otherwise, update the record with the new information.
	 *
	 * Note that we currently can't free chunks when the block size is large
	 * enough for multiple chunks. Leave the finobt record to remain in sync
	 * with the inobt.
	 */
	if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
	    mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
		error = xfs_btree_delete(cur, &i);
		if (error)
			goto error;
		ASSERT(i == 1);
	} else {
		error = xfs_inobt_update(cur, &rec);
		if (error)
			goto error;
	}

out:
	error = xfs_check_agi_freecount(cur);
	if (error)
		goto error;

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;

error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}

/*
 * Free disk inode.  Carefully avoids touching the incore inode, all
 * manipulations incore are the caller's responsibility.
 * The on-disk inode is not changed by this operation, only the
 * btree (free inode mask) is changed.
 *
 * Returns -EINVAL when @inode does not belong to @pag or maps to a block at
 * or beyond the AG block count, otherwise 0 or the error from reading the AGI
 * or updating the btrees.  @xic is filled in by xfs_difree_inobt().
 */
int
xfs_difree(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_ino_t		inode,
	struct xfs_icluster	*xic)
{
	/* REFERENCED */
	xfs_agblock_t		agbno;	/* block number containing inode */
	struct xfs_buf		*agbp;	/* buffer for allocation group header */
	xfs_agino_t		agino;	/* allocation group inode number */
	int			error;	/* error return value */
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inobt_rec_incore rec;/* btree record */

	/*
	 * Break up inode number into its components.
	 */
	if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) {
		xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).",
			__func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag));
		ASSERT(0);
		return -EINVAL;
	}
	agino = XFS_INO_TO_AGINO(mp, inode);
	/* The AG-relative number must round-trip to the original inode number. */
	if (inode != xfs_agino_to_ino(pag, agino))  {
		xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
			__func__, (unsigned long long)inode,
			(unsigned long long)xfs_agino_to_ino(pag, agino));
		ASSERT(0);
		return -EINVAL;
	}
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
		xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
			__func__, agbno, xfs_ag_block_count(mp, pag_agno(pag)));
		ASSERT(0);
		return -EINVAL;
	}
	/*
	 * Get the allocation group header.
	 */
	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
			__func__, error);
		return error;
	}

	/*
	 * Fix up the inode allocation btree.
	 */
	error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
	if (error)
		goto error0;

	/*
	 * Fix up the free inode btree.
	 */
	if (xfs_has_finobt(mp)) {
		error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
		if (error)
			goto error0;
	}

	return 0;

error0:
	return error;
}

/*
 * Look up the inobt record covering @agino and derive from it the first block
 * of the inode chunk (@chunk_agbno) and the distance in blocks from that
 * block to @agbno (@offset_agbno).
 *
 * Returns -EINVAL when no record is found, when the record does not contain
 * @agino, or when XFS_IGET_UNTRUSTED is set in @flags and the inode is marked
 * free in the record.  The output parameters are written only on success.
 */
STATIC int
xfs_imap_lookup(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_agino_t		agino,
	xfs_agblock_t		agbno,
	xfs_agblock_t		*chunk_agbno,
	xfs_agblock_t		*offset_agbno,
	int			flags)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;

	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_alert(mp,
			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
			__func__, error, pag_agno(pag));
		return error;
	}

	/*
	 * Lookup the inode record for the given agino. If the record cannot be
	 * found, then it's an invalid inode number and we should abort. Once
	 * we have a record, we need to ensure it contains the inode number
	 * we are looking up.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error) {
		if (i)
			error = xfs_inobt_get_rec(cur, &rec, &i);
		if (!error && i == 0)
			error = -EINVAL;
	}

	/* rec is only read below when error is zero, i.e. it was filled in. */
	xfs_trans_brelse(tp, agbp);
	xfs_btree_del_cursor(cur, error);
	if (error)
		return error;

	/* check that the returned record contains the required inode */
	if (rec.ir_startino > agino ||
	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
		return -EINVAL;

	/* for untrusted inodes check it is allocated first */
	if ((flags & XFS_IGET_UNTRUSTED) &&
	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
		return -EINVAL;

	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
	*offset_agbno = agbno - *chunk_agbno;
	return 0;
}

/*
 * Return the location of the inode in imap, for mapping it into a buffer.
 *
 * On success imap->im_blkno, imap->im_len and imap->im_boffset are filled in
 * and 0 is returned.  -EINVAL is returned when @ino does not decode to a
 * valid location in @pag or the computed range ends beyond sb_dblocks; errors
 * from xfs_imap_lookup() are passed through.
 */
int
xfs_imap(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,	/* inode to locate */
	struct xfs_imap		*imap,	/* location map structure */
	uint			flags)	/* flags for inode btree lookup */
{
	struct xfs_mount	*mp = pag_mount(pag);
	xfs_agblock_t		agbno;	/* block number of inode in the alloc group */
	xfs_agino_t		agino;	/* inode number within alloc group */
	xfs_agblock_t		chunk_agbno;	/* first block in inode chunk */
	xfs_agblock_t		cluster_agbno;	/* first block in inode cluster */
	int			error;	/* error code */
	int			offset;	/* index of inode in its buffer */
	xfs_agblock_t		offset_agbno;	/* blks from chunk start to inode */

	ASSERT(ino != NULLFSINO);

	/*
	 * Split up the inode number into its parts.
	 */
	agino = XFS_INO_TO_AGINO(mp, ino);
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
	    ino != xfs_agino_to_ino(pag, agino)) {
		error = -EINVAL;
#ifdef DEBUG
		/*
		 * Don't output diagnostic information for untrusted inodes
		 * as they can be invalid without implying corruption.
		 */
		if (flags & XFS_IGET_UNTRUSTED)
			return error;
		if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
			xfs_alert(mp,
		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
				__func__, (unsigned long long)agbno,
				(unsigned long)xfs_ag_block_count(mp,
							pag_agno(pag)));
		}
		if (ino != xfs_agino_to_ino(pag, agino)) {
			xfs_alert(mp,
		"%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
				__func__, ino,
				xfs_agino_to_ino(pag, agino));
		}
		xfs_stack_trace();
#endif /* DEBUG */
		return error;
	}

	/*
	 * For bulkstat and handle lookups, we have an untrusted inode number
	 * that we have to verify is valid. We cannot do this just by reading
	 * the inode buffer as it may have been unlinked and removed leaving
	 * inodes in stale state on disk. Hence we have to do a btree lookup
	 * in all cases where an untrusted inode number is passed.
	 */
	if (flags & XFS_IGET_UNTRUSTED) {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
		goto out_map;
	}

	/*
	 * If the inode cluster size is the same as the blocksize or
	 * smaller we get to the buffer by simple arithmetics.
	 */
	if (M_IGEO(mp)->blocks_per_cluster == 1) {
		offset = XFS_INO_TO_OFFSET(mp, ino);
		ASSERT(offset < mp->m_sb.sb_inopblock);

		imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
		imap->im_len = XFS_FSB_TO_BB(mp, 1);
		imap->im_boffset = (unsigned short)(offset <<
							mp->m_sb.sb_inodelog);
		return 0;
	}

	/*
	 * If the inode chunks are aligned then use simple maths to
	 * find the location. Otherwise we have to do a btree
	 * lookup to find the location.
	 */
	if (M_IGEO(mp)->inoalign_mask) {
		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
		chunk_agbno = agbno - offset_agbno;
	} else {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
	}

out_map:
	ASSERT(agbno >= chunk_agbno);
	/* Round the in-chunk block offset down to a cluster boundary. */
	cluster_agbno = chunk_agbno +
		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
		 M_IGEO(mp)->blocks_per_cluster);
	/* Inode index within the cluster buffer. */
	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
		XFS_INO_TO_OFFSET(mp, ino);

	imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);

	/*
	 * If the inode number maps to a block outside the bounds
	 * of the file system then return NULL rather than calling
	 * read_buf and panicing when we get an error from the
	 * driver.
	 */
	if ((imap->im_blkno + imap->im_len) >
	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
		xfs_alert(mp,
	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
			__func__, (unsigned long long) imap->im_blkno,
			(unsigned long long) imap->im_len,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
		return -EINVAL;
	}
	return 0;
}

/*
 * Log specified fields for the ag hdr (inode section). The growth of the agi
 * structure over time requires that we interpret the buffer as two logical
 * regions delineated by the end of the unlinked list. This is due to the size
 * of the hash table and its location in the middle of the agi.
 *
 * For example, a request to log a field before agi_unlinked and a field after
 * agi_unlinked could cause us to log the entire hash table and use an excessive
 * amount of log space. To avoid this behavior, log the region up through
 * agi_unlinked in one call and the region after agi_unlinked through the end of
 * the structure in another.
 */
void
xfs_ialloc_log_agi(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	uint32_t		fields)
{
	int			first;		/* first byte number */
	int			last;		/* last byte number */
	static const short	offsets[] = {	/* field starting offsets */
					/* keep in sync with bit definitions */
		offsetof(xfs_agi_t, agi_magicnum),
		offsetof(xfs_agi_t, agi_versionnum),
		offsetof(xfs_agi_t, agi_seqno),
		offsetof(xfs_agi_t, agi_length),
		offsetof(xfs_agi_t, agi_count),
		offsetof(xfs_agi_t, agi_root),
		offsetof(xfs_agi_t, agi_level),
		offsetof(xfs_agi_t, agi_freecount),
		offsetof(xfs_agi_t, agi_newino),
		offsetof(xfs_agi_t, agi_dirino),
		offsetof(xfs_agi_t, agi_unlinked),
		offsetof(xfs_agi_t, agi_free_root),
		offsetof(xfs_agi_t, agi_free_level),
		offsetof(xfs_agi_t, agi_iblocks),
		sizeof(xfs_agi_t)
	};
#ifdef DEBUG
	struct xfs_agi		*agi = bp->b_addr;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif

	/*
	 * Compute byte offsets for the first and last fields in the first
	 * region and log the agi buffer. This only logs up through
	 * agi_unlinked.
	 */
	if (fields & XFS_AGI_ALL_BITS_R1) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}

	/*
	 * Mask off the bits in the first region and calculate the first and
	 * last field offsets for any bits in the second region.
	 */
	fields &= ~XFS_AGI_ALL_BITS_R1;
	if (fields) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}
}

/*
 * Structural checks on an AGI buffer.
 *
 * Returns NULL when every check passes, otherwise the address of the failing
 * check (or the value returned by xfs_validate_ag_length()).
 */
static xfs_failaddr_t
xfs_agi_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;
	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
	int			i;

	/* The UUID and LSN are only checked on CRC-enabled filesystems. */
	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
			return __this_address;
	}

	/*
	 * Validate the magic number of the agi block.
	 */
	if (!xfs_verify_magic(bp, agi->agi_magicnum))
		return __this_address;
	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
		return __this_address;

	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
	if (fa)
		return fa;

	/* Btree heights must lie in [1, inobt_maxlevels]. */
	if (be32_to_cpu(agi->agi_level) < 1 ||
	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
		return __this_address;

	if (xfs_has_finobt(mp) &&
	    (be32_to_cpu(agi->agi_free_level) < 1 ||
	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
		return __this_address;

	/* Every unlinked bucket is either NULLAGINO or a verifiable inode. */
	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
			continue;
		if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
			return __this_address;
	}

	return NULL;
}

/*
 * Read verifier: report -EFSBADCRC on a checksum mismatch (CRC filesystems
 * only), otherwise run the structural checks and report -EFSCORRUPTED when
 * they fail or when the XFS_ERRTAG_IALLOC_READ_AGI test condition triggers.
 */
static void
xfs_agi_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_has_crc(mp) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agi_verify(bp);
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

/*
 * Write verifier: reject a structurally bad AGI, then on CRC filesystems
 * stamp the LSN (when the buffer has a log item) and update the checksum.
 */
static void
xfs_agi_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agi_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (!xfs_has_crc(mp))
		return;

	if (bip)
		agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}

/* Buffer operations for AGI blocks; both magic slots hold XFS_AGI_MAGIC. */
const struct xfs_buf_ops xfs_agi_buf_ops = {
	.name = "xfs_agi",
	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
	.verify_read = xfs_agi_read_verify,
	.verify_write = xfs_agi_write_verify,
	.verify_struct = xfs_agi_verify,
};

/*
 * Read in the allocation group header (inode allocation section)
 *
 * On success the buffer is returned in @agibpp.  When the read error is one
 * that xfs_metadata_is_sick() recognises, the AG is marked sick for
 * XFS_SICK_AG_AGI before the error is returned.
 */
int
xfs_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	int			error;

	trace_xfs_read_agi(pag);

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
	if (error)
		return error;
	/* The buffer type is only recorded when a transaction is supplied. */
	if (tp)
		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);

	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
	return 0;
}

/*
 * Read in the agi and initialise the per-ag data. If the caller supplies a
 * @agibpp, return the locked AGI buffer to them, otherwise release it.
 *
 * XFS_IALLOC_FLAG_TRYLOCK in @flags is translated to XBF_TRYLOCK for the
 * buffer read; no other flag bit is consulted here.
 */
int
xfs_ialloc_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	int			error;

	trace_xfs_ialloc_read_agi(pag);

	error = xfs_read_agi(pag, tp,
			(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			&agibp);
	if (error)
		return error;

	agi = agibp->b_addr;
	/* Seed the in-core counters from disk the first time the AGI is read. */
	if (!xfs_perag_initialised_agi(pag)) {
		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
		pag->pagi_count = be32_to_cpu(agi->agi_count);
		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	}

	/*
	 * It's possible for these to be out of sync if
	 * we are in the middle of a forced shutdown.
	 */
	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
		xfs_is_shutdown(pag_mount(pag)));
	if (agibpp)
		*agibpp = agibp;
	else
		xfs_trans_brelse(tp, agibp);
	return 0;
}

/* How many inodes are backed by inode clusters ondisk?
*/ STATIC int xfs_ialloc_count_ondisk( struct xfs_btree_cur *cur, xfs_agino_t low, xfs_agino_t high, unsigned int *allocated) { struct xfs_inobt_rec_incore irec; unsigned int ret = 0; int has_record; int error; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); if (error) return error; while (has_record) { unsigned int i, hole_idx; error = xfs_inobt_get_rec(cur, &irec, &has_record); if (error) return error; if (irec.ir_startino > high) break; for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { if (irec.ir_startino + i < low) continue; if (irec.ir_startino + i > high) break; hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; if (!(irec.ir_holemask & (1U << hole_idx))) ret++; } error = xfs_btree_increment(cur, 0, &has_record); if (error) return error; } *allocated = ret; return 0; } /* Is there an inode record covering a given extent? */ int xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome) { xfs_agino_t agino; xfs_agino_t last_agino; unsigned int allocated; int error; agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); if (error) return error; if (allocated == 0) *outcome = XBTREE_RECPACKING_EMPTY; else if (allocated == last_agino - agino + 1) *outcome = XBTREE_RECPACKING_FULL; else *outcome = XBTREE_RECPACKING_SPARSE; return 0; } struct xfs_ialloc_count_inodes { xfs_agino_t count; xfs_agino_t freecount; }; /* Record inode counts across all inobt records. 
 */
STATIC int
xfs_ialloc_count_inodes_rec(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xfs_ialloc_count_inodes	*ci = priv;
	xfs_failaddr_t			fa;

	/* Decode and sanity-check the record before trusting its counts. */
	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, &irec);

	ci->count += irec.ir_count;
	ci->freecount += irec.ir_freecount;

	return 0;
}

/*
 * Count allocated and free inodes under an inobt.
 *
 * @count and @freecount are written only when the walk succeeds.
 */
int
xfs_ialloc_count_inodes(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			*count,
	xfs_agino_t			*freecount)
{
	struct xfs_ialloc_count_inodes	ci = {0};
	int				error;

	ASSERT(xfs_btree_is_ino(cur->bc_ops));
	error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
	if (error)
		return error;

	*count = ci.count;
	*freecount = ci.freecount;
	return 0;
}

/*
 * Initialize inode-related geometry information.
 *
 * Compute the inode btree min and max levels and set maxicount.
 *
 * Set the inode cluster size.  This may still be overridden by the file
 * system block size if it is larger than the chosen cluster size.
 *
 * For v5 filesystems, scale the cluster size with the inode size to keep a
 * constant ratio of inode per cluster buffer, but only if mkfs has set the
 * inode alignment value appropriately for larger cluster sizes.
 *
 * Then compute the inode cluster alignment information.
 */
void
xfs_ialloc_setup_geometry(
	struct xfs_mount	*mp)
{
	struct xfs_sb		*sbp = &mp->m_sb;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	uint64_t		icount;
	uint			inodes;

	/* diflags2 bits that depend on the enabled filesystem features. */
	igeo->new_diflags2 = 0;
	if (xfs_has_bigtime(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
	if (xfs_has_large_extent_counts(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;

	/* Compute inode btree geometry. */
	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;

	/* An allocation covers at least one chunk and at least one block. */
	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
			sbp->sb_inopblock);
	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;

	if (sbp->sb_spino_align)
		igeo->ialloc_min_blks = sbp->sb_spino_align;
	else
		igeo->ialloc_min_blks = igeo->ialloc_blks;

	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
			inodes);
	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());

	/*
	 * Set the maximum inode count for this filesystem, being careful not
	 * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
	 * users should never get here due to failing sb verification, but
	 * certain users (xfs_db) need to be usable even with corrupt metadata.
	 */
	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
		/*
		 * Make sure the maximum inode count is a multiple
		 * of the units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, igeo->ialloc_blks);
		igeo->maxicount = XFS_FSB_TO_INO(mp,
				icount * igeo->ialloc_blks);
	} else {
		/* Zero makes xfs_dialloc() skip its inode-count ceiling check. */
		igeo->maxicount = 0;
	}

	/*
	 * Compute the desired size of an inode cluster buffer size, which
	 * starts at 8K and (on v5 filesystems) scales up with larger inode
	 * sizes.
	 *
	 * Preserve the desired inode cluster size because the sparse inodes
	 * feature uses that desired size (not the actual size) to compute the
	 * sparse inode alignment.  The mount code validates this value, so we
	 * cannot change the behavior.
	 */
	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
	if (xfs_has_v3inodes(mp)) {
		int	new_size = igeo->inode_cluster_size_raw;

		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
			igeo->inode_cluster_size_raw = new_size;
	}

	/* Calculate inode cluster ratios. */
	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
				igeo->inode_cluster_size_raw);
	else
		igeo->blocks_per_cluster = 1;
	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);

	/* Calculate inode cluster alignment. */
	if (xfs_has_align(mp) &&
	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
	else
		igeo->cluster_align = 1;
	igeo->inoalign_mask = igeo->cluster_align - 1;
	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);

	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && igeo->inoalign_mask &&
	    !(mp->m_dalign & igeo->inoalign_mask))
		igeo->ialloc_align = mp->m_dalign;
	else
		igeo->ialloc_align = 0;

	/* Blocks larger than a page get a nonzero minimum folio order. */
	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
		igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
	else
		igeo->min_folio_order = 0;
}

/* Compute the location of the root directory inode that is laid out by mkfs. */
xfs_ino_t
xfs_ialloc_calc_rootino(
	struct xfs_mount	*mp,
	int			sunit)
{
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agblock_t		first_bno;

	/*
	 * Pre-calculate the geometry of AG 0.  We know what it looks like
	 * because libxfs knows how to create allocation groups now.
	 *
	 * first_bno is the first block in which mkfs could possibly have
	 * allocated the root directory inode, once we factor in the metadata
	 * that mkfs formats before it.  Namely, the four AG headers...
	 */
	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);

	/* ...the two free space btree roots... */
	first_bno += 2;

	/* ...the inode btree root... */
	first_bno += 1;

	/* ...the initial AGFL... */
	first_bno += xfs_alloc_min_freelist(mp, NULL);

	/* ...the free inode btree root... */
	if (xfs_has_finobt(mp))
		first_bno++;

	/* ...the reverse mapping btree root... */
	if (xfs_has_rmapbt(mp))
		first_bno++;

	/* ...the reference count btree... */
	if (xfs_has_reflink(mp))
		first_bno++;

	/*
	 * ...and the log, if it is allocated in the first allocation group.
	 *
	 * This can happen with filesystems that only have a single
	 * allocation group, or very odd geometries created by old mkfs
	 * versions on very small filesystems.
	 */
	if (xfs_ag_contains_log(mp, 0))
		 first_bno += mp->m_sb.sb_logblocks;

	/*
	 * Now round first_bno up to whatever allocation alignment is given
	 * by the filesystem or was passed in.
	 */
	if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
		first_bno = roundup(first_bno, sunit);
	else if (xfs_has_align(mp) &&
			mp->m_sb.sb_inoalignmt > 1)
		first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);

	/* The root inode is the first inode of that block in AG 0. */
	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
}

/*
 * Ensure there are not sparse inode clusters that cross the new EOAG.
 *
 * This is a no-op for non-spinode filesystems since clusters are always fully
 * allocated and checking the bnobt suffices.  However, a spinode filesystem
 * could have a record where the upper inodes are free blocks.  If those blocks
 * were removed from the filesystem, the inode record would extend beyond EOAG,
 * which will be flagged as corruption.
 *
 * Returns 0 when the shrink to @new_length is acceptable, -ENOSPC when an
 * inode record would cross the new end, or another error from the btree
 * helpers.
 */
int
xfs_ialloc_check_shrink(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agibp,
	xfs_agblock_t		new_length)
{
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	xfs_agino_t		agino;
	int			has;
	int			error;

	if (!xfs_has_sparseinodes(pag_mount(pag)))
		return 0;

	cur = xfs_inobt_init_cursor(pag, tp, agibp);

	/* Look up the inobt record that would correspond to the new EOFS. */
	agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
	/* No record at or below the new end means nothing can cross it. */
	if (error || !has)
		goto out;

	error = xfs_inobt_get_rec(cur, &rec, &has);
	if (error)
		goto out;

	/* The lookup found a record, so failing to read it back is corruption. */
	if (!has) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
		error = -EFSCORRUPTED;
		goto out;
	}

	/* If the record covers inodes that would be beyond EOFS, bail out. */
	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
		error = -ENOSPC;
		goto out;
	}
out:
	xfs_btree_del_cursor(cur, error);
	return error;
}
9 9 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Framing Layer. * * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/crc-ccitt.h> #include <linux/netdevice.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cffrml, layer) struct cffrml { struct cflayer layer; bool dofcs; /* !< FCS active */ int __percpu *pcpu_refcnt; }; static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt); static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static u32 cffrml_rcv_error; static u32 cffrml_rcv_checsum_error; struct cflayer *cffrml_create(u16 phyid, bool use_fcs) { struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC); if (!this) return NULL; this->pcpu_refcnt = alloc_percpu(int); if (this->pcpu_refcnt == NULL) { kfree(this); return NULL; } caif_assert(offsetof(struct cffrml, layer) == 0); this->layer.receive = cffrml_receive; this->layer.transmit = cffrml_transmit; this->layer.ctrlcmd = cffrml_ctrlcmd; 
snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid); this->dofcs = use_fcs; this->layer.id = phyid; return (struct cflayer *) this; } void cffrml_free(struct cflayer *layer) { struct cffrml *this = container_obj(layer); free_percpu(this->pcpu_refcnt); kfree(layer); } void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up) { this->up = up; } void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn) { this->dn = dn; } static u16 cffrml_checksum(u16 chks, void *buf, u16 len) { /* FIXME: FCS should be moved to glue in order to use OS-Specific * solutions */ return crc_ccitt(chks, buf, len); } static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) { u16 tmp; u16 len; u16 hdrchks; int pktchks; struct cffrml *this; this = container_obj(layr); cfpkt_extr_head(pkt, &tmp, 2); len = le16_to_cpu(tmp); /* Subtract for FCS on length if FCS is not used. */ if (!this->dofcs) len -= 2; if (cfpkt_setlen(pkt, len) < 0) { ++cffrml_rcv_error; pr_err("Framing length error (%d)\n", len); cfpkt_destroy(pkt); return -EPROTO; } /* * Don't do extract if FCS is false, rather do setlen - then we don't * get a cache-miss. 
*/ if (this->dofcs) { cfpkt_extr_trail(pkt, &tmp, 2); hdrchks = le16_to_cpu(tmp); pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); if (pktchks != hdrchks) { cfpkt_add_trail(pkt, &tmp, 2); ++cffrml_rcv_error; ++cffrml_rcv_checsum_error; pr_info("Frame checksum error (0x%x != 0x%x)\n", hdrchks, pktchks); return -EILSEQ; } } if (cfpkt_erroneous(pkt)) { ++cffrml_rcv_error; pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->up == NULL) { pr_err("Layr up is missing!\n"); cfpkt_destroy(pkt); return -EINVAL; } return layr->up->receive(layr->up, pkt); } static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) { u16 chks; u16 len; __le16 data; struct cffrml *this = container_obj(layr); if (this->dofcs) { chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); data = cpu_to_le16(chks); cfpkt_add_trail(pkt, &data, 2); } else { cfpkt_pad_trail(pkt, 2); } len = cfpkt_getlen(pkt); data = cpu_to_le16(len); cfpkt_add_head(pkt, &data, 2); cfpkt_info(pkt)->hdr_len += 2; if (cfpkt_erroneous(pkt)) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->dn == NULL) { cfpkt_destroy(pkt); return -ENODEV; } return layr->dn->transmit(layr->dn, pkt); } static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } void cffrml_put(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_dec(*this->pcpu_refcnt); } void cffrml_hold(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_inc(*this->pcpu_refcnt); } int cffrml_refcnt_read(struct cflayer *layr) { int i, refcnt = 0; struct cffrml *this = container_obj(layr); for_each_possible_cpu(i) refcnt += *per_cpu_ptr(this->pcpu_refcnt, i); return refcnt; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_EC_H #define _BCACHEFS_EC_H #include "ec_types.h" #include "buckets_types.h" #include "extents_types.h" int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ .key_validate = bch2_stripe_validate, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_stripe, \ .min_val_size = 8, \ }) static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) { return DIV_ROUND_UP(le16_to_cpu(s->sectors), 1 << s->csum_granularity_bits); } static inline unsigned stripe_csum_offset(const struct bch_stripe *s, unsigned dev, unsigned csum_idx) { EBUG_ON(s->csum_type >= BCH_CSUM_NR); unsigned csum_bytes = 
bch_crc_bytes[s->csum_type]; return sizeof(struct bch_stripe) + sizeof(struct bch_extent_ptr) * s->nr_blocks + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; } static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, unsigned idx) { return stripe_csum_offset(s, s->nr_blocks, 0) + sizeof(u16) * idx; } static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, unsigned idx) { return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); } static inline void stripe_blockcount_set(struct bch_stripe *s, unsigned idx, unsigned v) { __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); *p = cpu_to_le16(v); } static inline unsigned stripe_val_u64s(const struct bch_stripe *s) { return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), sizeof(u64)); } static inline void *stripe_csum(struct bch_stripe *s, unsigned block, unsigned csum_idx) { EBUG_ON(block >= s->nr_blocks); EBUG_ON(csum_idx >= stripe_csums_per_device(s)); return (void *) s + stripe_csum_offset(s, block, csum_idx); } static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, unsigned block, unsigned csum_idx) { struct bch_csum csum = { 0 }; memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); return csum; } static inline void stripe_csum_set(struct bch_stripe *s, unsigned block, unsigned csum_idx, struct bch_csum csum) { memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); } static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, const struct bch_extent_ptr *data_ptr, unsigned sectors) { return (data_ptr->dev == stripe_ptr->dev || data_ptr->dev == BCH_SB_MEMBER_INVALID || stripe_ptr->dev == BCH_SB_MEMBER_INVALID) && data_ptr->gen == stripe_ptr->gen && data_ptr->offset >= stripe_ptr->offset && data_ptr->offset < stripe_ptr->offset + sectors; } static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, struct extent_ptr_decoded p) { unsigned nr_data = 
s->nr_blocks - s->nr_redundant; BUG_ON(!p.has_ec); if (p.ec.block >= nr_data) return false; return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, le16_to_cpu(s->sectors)); } static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, struct extent_ptr_decoded p) { unsigned nr_data = m->nr_blocks - m->nr_redundant; BUG_ON(!p.has_ec); if (p.ec.block >= nr_data) return false; return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, m->sectors); } struct bch_read_bio; struct ec_stripe_buf { /* might not be buffering the entire stripe: */ unsigned offset; unsigned size; unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; void *data[BCH_BKEY_PTRS_MAX]; __BKEY_PADDED(key, 255); }; struct ec_stripe_head; enum ec_stripe_ref { STRIPE_REF_io, STRIPE_REF_stripe, STRIPE_REF_NR }; struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; struct mutex lock; struct list_head list; struct hlist_node hash; u64 idx; struct closure iodone; atomic_t ref[STRIPE_REF_NR]; int err; u8 nr_data; u8 nr_parity; bool allocated; bool pending; bool have_existing_stripe; unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; struct disk_reservation res; struct ec_stripe_buf new_stripe; struct ec_stripe_buf existing_stripe; }; struct ec_stripe_head { struct list_head list; struct mutex lock; unsigned disk_label; unsigned algo; unsigned redundancy; enum bch_watermark watermark; bool insufficient_devs; unsigned long rw_devs_change_count; u64 nr_created; struct bch_devs_mask devs; unsigned nr_active_devs; unsigned blocksize; struct dev_stripe_state block_stripe; struct dev_stripe_state parity_stripe; struct ec_stripe_new *s; }; int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); 
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, unsigned, unsigned, unsigned, enum bch_watermark, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); static inline void ec_stripe_new_get(struct ec_stripe_new *s, enum ec_stripe_ref ref) { atomic_inc(&s->ref[ref]); } static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, enum ec_stripe_ref ref) { BUG_ON(atomic_read(&s->ref[ref]) <= 0); if (atomic_dec_and_test(&s->ref[ref])) switch (ref) { case STRIPE_REF_stripe: bch2_ec_stripe_new_free(c, s); break; case STRIPE_REF_io: bch2_ec_do_stripe_creates(c); break; default: BUG(); } } int bch2_dev_remove_stripes(struct bch_fs *, unsigned); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); #endif /* _BCACHEFS_EC_H */
2 2 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2020 Facebook */ #include <linux/fs.h> #include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/rcupdate_trace.h> struct bpf_iter_target_info { struct list_head list; const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; struct bpf_iter_link { struct bpf_link link; struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; bool done_stop; u8 target_private[] __aligned(8); }; static struct list_head targets = LIST_HEAD_INIT(targets); static 
DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->seq_num++; } static void bpf_iter_dec_seq_num(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->seq_num--; } static void bpf_iter_done_stop(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->done_stop = true; } static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo) { return tinfo->reg_info->feature & BPF_ITER_RESCHED; } static bool bpf_iter_support_resched(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); return bpf_iter_target_support_resched(iter_priv->tinfo); } /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 /* bpf_seq_read, a customized and simpler version for bpf iterator. * The following are differences from seq_read(): * . fixed buffer size (PAGE_SIZE) * . assuming NULL ->llseek() * . 
stop() may call bpf program, handling potential overflow there */ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; bool can_resched; void *p; mutex_lock(&seq->lock); if (!seq->buf) { seq->size = PAGE_SIZE << 3; seq->buf = kvmalloc(seq->size, GFP_KERNEL); if (!seq->buf) { err = -ENOMEM; goto done; } } if (seq->count) { n = min(seq->count, size); err = copy_to_user(buf, seq->buf + seq->from, n); if (err) { err = -EFAULT; goto done; } seq->count -= n; seq->from += n; copied = n; goto done; } seq->from = 0; p = seq->op->start(seq, &seq->index); if (!p) goto stop; if (IS_ERR(p)) { err = PTR_ERR(p); seq->op->stop(seq, p); seq->count = 0; goto done; } err = seq->op->show(seq, p); if (err > 0) { /* object is skipped, decrease seq_num, so next * valid object can reuse the same seq_num. */ bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) err = -E2BIG; seq->op->stop(seq, p); seq->count = 0; goto done; } can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; num_objs++; offs = seq->count; p = seq->op->next(seq, p, &seq->index); if (pos == seq->index) { pr_info_ratelimited("buggy seq_file .next function %ps " "did not updated position index\n", seq->op->next); seq->index++; } if (IS_ERR_OR_NULL(p)) break; /* got a valid next object, increase seq_num */ bpf_iter_inc_seq_num(seq); if (seq->count >= size) break; if (num_objs >= MAX_ITER_OBJECTS) { if (offs == 0) { err = -EAGAIN; seq->op->stop(seq, p); goto done; } break; } err = seq->op->show(seq, p); if (err > 0) { bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; if (offs == 0) { if (!err) err = -E2BIG; seq->op->stop(seq, p); goto done; } break; } if (can_resched) cond_resched(); } stop: offs = seq->count; if (IS_ERR(p)) { seq->op->stop(seq, 
NULL); err = PTR_ERR(p); goto done; } /* bpf program called if !p */ seq->op->stop(seq, p); if (!p) { if (!seq_has_overflowed(seq)) { bpf_iter_done_stop(seq); } else { seq->count = offs; if (offs == 0) { err = -E2BIG; goto done; } } } n = min(seq->count, size); err = copy_to_user(buf, seq->buf, n); if (err) { err = -EFAULT; goto done; } copied = n; seq->count -= n; seq->from = n; done: if (!copied) copied = err; else *ppos += copied; mutex_unlock(&seq->lock); return copied; } static const struct bpf_iter_seq_info * __get_seq_info(struct bpf_iter_link *link) { const struct bpf_iter_seq_info *seq_info; if (link->aux.map) { seq_info = link->aux.map->ops->iter_seq_info; if (seq_info) return seq_info; } return link->tinfo->reg_info->seq_info; } static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; struct seq_file *seq; seq = file->private_data; if (!seq) return 0; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); if (iter_priv->seq_info->fini_seq_private) iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; return seq_release_private(inode, file); } const struct file_operations bpf_iter_fops = { .open = iter_open, .read = bpf_seq_read, .release = iter_release, }; /* The argument reg_info will be cached in bpf_iter_target_info. * The common practice is to declare target reg_info as * a const static variable and passed as an argument to * bpf_iter_reg_target(). 
*/ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); list_add(&tinfo->list, &targets); mutex_unlock(&targets_mutex); return 0; } void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; break; } } mutex_unlock(&targets_mutex); WARN_ON(found == false); } static void cache_btf_id(struct bpf_iter_target_info *tinfo, struct bpf_prog *prog) { tinfo->btf_id = prog->aux->attach_btf_id; } bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; int prefix_len = strlen(prefix); if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { if (iter->btf_id && iter->btf_id == prog_btf_id) { tinfo = iter; break; } if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { cache_btf_id(iter, prog); tinfo = iter; break; } } mutex_unlock(&targets_mutex); if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } return tinfo != NULL; } const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_iter_target_info *tinfo; const struct bpf_func_proto *fn = NULL; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (tinfo->btf_id == prog->aux->attach_btf_id) { const struct bpf_iter_reg *reg_info; reg_info = tinfo->reg_info; if 
(reg_info->get_func_proto) fn = reg_info->get_func_proto(func_id, prog); break; } } mutex_unlock(&targets_mutex); return fn; } static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); if (iter_link->tinfo->reg_info->detach_target) iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); kfree(iter_link); } static int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { int ret = 0; mutex_lock(&link_mutex); if (old_prog && link->prog != old_prog) { ret = -EPERM; goto out_unlock; } if (link->prog->type != new_prog->type || link->prog->expected_attach_type != new_prog->expected_attach_type || link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { ret = -EINVAL; goto out_unlock; } old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&link_mutex); return ret; } static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); bpf_iter_show_fdinfo_t show_fdinfo; seq_printf(seq, "target_name:\t%s\n", iter_link->tinfo->reg_info->target); show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; if (show_fdinfo) show_fdinfo(&iter_link->aux, seq); } static int bpf_iter_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); char __user *ubuf = u64_to_user_ptr(info->iter.target_name); bpf_iter_fill_link_info_t fill_link_info; u32 ulen = info->iter.target_name_len; const char *target_name; u32 target_len; if (!ulen ^ !ubuf) return -EINVAL; target_name = iter_link->tinfo->reg_info->target; target_len = strlen(target_name); info->iter.target_name_len = 
target_len + 1; if (ubuf) { if (ulen >= target_len + 1) { if (copy_to_user(ubuf, target_name, target_len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(ubuf, target_name, ulen - 1)) return -EFAULT; if (put_user(zero, ubuf + ulen - 1)) return -EFAULT; return -ENOSPC; } } fill_link_info = iter_link->tinfo->reg_info->fill_link_info; if (fill_link_info) return fill_link_info(&iter_link->aux, info); return 0; } static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, .show_fdinfo = bpf_iter_link_show_fdinfo, .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) { return link->ops == &bpf_iter_link_lops; } int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; bpfptr_t ulinfo; int err; if (attr->link_create.target_fd || attr->link_create.flags) return -EINVAL; memset(&linfo, 0, sizeof(union bpf_iter_link_info)); ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel); linfo_len = attr->link_create.iter_info_len; if (bpfptr_is_null(ulinfo) ^ !linfo_len) return -EINVAL; if (!bpfptr_is_null(ulinfo)) { err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), linfo_len); if (err) return err; linfo_len = min_t(u32, linfo_len, sizeof(linfo)); if (copy_from_bpfptr(&linfo, ulinfo, linfo_len)) return -EFAULT; } prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { if (iter->btf_id == prog_btf_id) { tinfo = iter; break; } } mutex_unlock(&targets_mutex); if (!tinfo) return -ENOENT; /* Only allow sleepable program for resched-able iterator */ if (prog->sleepable && !bpf_iter_target_support_resched(tinfo)) return -EINVAL; link = 
kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; } if (tinfo->reg_info->attach_target) { err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { bpf_link_cleanup(&link_primer); return err; } } return bpf_link_settle(&link_primer); } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; struct bpf_prog *prog; u32 total_priv_dsize; struct seq_file *seq; int err = 0; mutex_lock(&link_mutex); prog = link->link.prog; bpf_prog_inc(prog); mutex_unlock(&link_mutex); tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + seq_info->seq_priv_size; priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } if (seq_info->init_seq_private) { err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; return 0; release_seq_file: seq_release_private(file->f_inode, file); file->private_data = NULL; release_prog: bpf_prog_put(prog); return err; } int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; if (link->ops != 
&bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; fd = get_unused_fd_flags(flags); if (fd < 0) return fd; file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto free_fd; } iter_link = container_of(link, struct bpf_iter_link, link); err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; fd_install(fd, file); return fd; free_file: fput(file); free_fd: put_unused_fd(fd); return err; } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) { struct bpf_iter_priv_data *iter_priv; struct seq_file *seq; void *seq_priv; seq = meta->seq; if (seq->file->f_op != &bpf_iter_fops) return NULL; seq_priv = seq->private; iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, target_private); if (in_stop && iter_priv->done_stop) return NULL; meta->session_id = iter_priv->session_id; meta->seq_num = iter_priv->seq_num; return iter_priv->prog; } int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } /* bpf program can only return 0 or 1: * 0 : okay * 1 : retry the same object * The bpf_iter_run_prog() return value * will be seq_ops->show() return value. */ return ret == 0 ? 
0 : -EAGAIN; } BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, void *, callback_ctx, u64, flags) { return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); } const struct bpf_func_proto bpf_for_each_map_elem_proto = { .func = bpf_for_each_map_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { bpf_callback_t callback = (bpf_callback_t)callback_fn; u64 ret; u32 i; /* Note: these safety checks are also verified when bpf_loop * is inlined, be careful to modify this code in sync. See * function verifier.c:inline_bpf_loop. */ if (flags) return -EINVAL; if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) return i + 1; } return i; } const struct bpf_func_proto bpf_loop_proto = { .func = bpf_loop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; struct bpf_iter_num_kern { int cur; /* current value, inclusive */ int end; /* final value, exclusive */ } __aligned(8); __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) { struct bpf_iter_num_kern *s = (void *)it; BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); /* start == end is legit, it's an empty range and we'll just get NULL * on first (and any subsequent) bpf_iter_num_next() call */ if (start > end) { s->cur = s->end = 0; return -EINVAL; } /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */ if ((s64)end - 
(s64)start > BPF_MAX_LOOPS) { s->cur = s->end = 0; return -E2BIG; } /* user will call bpf_iter_num_next() first, * which will set s->cur to exactly start value; * underflow shouldn't matter */ s->cur = start - 1; s->end = end; return 0; } __bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it) { struct bpf_iter_num_kern *s = (void *)it; /* check failed initialization or if we are done (same behavior); * need to be careful about overflow, so convert to s64 for checks, * e.g., if s->cur == s->end == INT_MAX, we can't just do * s->cur + 1 >= s->end */ if ((s64)(s->cur + 1) >= s->end) { s->cur = s->end = 0; return NULL; } s->cur++; return &s->cur; } __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) { struct bpf_iter_num_kern *s = (void *)it; s->cur = s->end = 0; } __bpf_kfunc_end_defs();
207 207 122 124 4 3 8 15 15 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * Digital Audio (PCM) abstract layer * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/time.h> #include <linux/gcd.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/timer.h> #include "pcm_local.h" /* * Timer functions */ void snd_pcm_timer_resolution_change(struct snd_pcm_substream *substream) { unsigned long rate, mult, fsize, l, post; struct snd_pcm_runtime *runtime = substream->runtime; mult = 1000000000; rate = runtime->rate; if (snd_BUG_ON(!rate)) return; l = gcd(mult, rate); mult /= l; rate /= l; fsize = runtime->period_size; if (snd_BUG_ON(!fsize)) return; l = gcd(rate, fsize); rate /= l; fsize /= l; post = 1; while ((mult * fsize) / fsize != mult) { mult /= 2; post *= 2; } if (rate == 0) { pcm_err(substream->pcm, "pcm timer resolution out of range (rate = %u, period_size = %lu)\n", runtime->rate, runtime->period_size); runtime->timer_resolution = -1; return; } runtime->timer_resolution = (mult * fsize / rate) * post; } static unsigned long snd_pcm_timer_resolution(struct snd_timer * timer) { struct snd_pcm_substream *substream; substream = timer->private_data; return substream->runtime ? 
substream->runtime->timer_resolution : 0; } static int snd_pcm_timer_start(struct snd_timer * timer) { struct snd_pcm_substream *substream; substream = snd_timer_chip(timer); substream->timer_running = 1; return 0; } static int snd_pcm_timer_stop(struct snd_timer * timer) { struct snd_pcm_substream *substream; substream = snd_timer_chip(timer); substream->timer_running = 0; return 0; } static const struct snd_timer_hardware snd_pcm_timer = { .flags = SNDRV_TIMER_HW_AUTO | SNDRV_TIMER_HW_SLAVE, .resolution = 0, .ticks = 1, .c_resolution = snd_pcm_timer_resolution, .start = snd_pcm_timer_start, .stop = snd_pcm_timer_stop, }; /* * Init functions */ static void snd_pcm_timer_free(struct snd_timer *timer) { struct snd_pcm_substream *substream = timer->private_data; substream->timer = NULL; } void snd_pcm_timer_init(struct snd_pcm_substream *substream) { struct snd_timer_id tid; struct snd_timer *timer; tid.dev_sclass = SNDRV_TIMER_SCLASS_NONE; tid.dev_class = SNDRV_TIMER_CLASS_PCM; tid.card = substream->pcm->card->number; tid.device = substream->pcm->device; tid.subdevice = (substream->number << 1) | (substream->stream & 1); if (snd_timer_new(substream->pcm->card, "PCM", &tid, &timer) < 0) return; sprintf(timer->name, "PCM %s %i-%i-%i", snd_pcm_direction_name(substream->stream), tid.card, tid.device, tid.subdevice); timer->hw = snd_pcm_timer; if (snd_device_register(timer->card, timer) < 0) { snd_device_free(timer->card, timer); return; } timer->private_data = substream; timer->private_free = snd_pcm_timer_free; substream->timer = timer; } void snd_pcm_timer_done(struct snd_pcm_substream *substream) { if (substream->timer) { snd_device_free(substream->pcm->card, substream->timer); substream->timer = NULL; } }
5 5 10 10 13 3 1 10 10 8 10 10 10 9 9 9 9 13 1 12 4 4 3 4 1 1 1 1 1 1 73 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, Intel Corporation. 
* * Author: Alexander Duyck <alexander.h.duyck@intel.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct multiq_sched_data { u16 bands; u16 max_bands; u16 curband; struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc **queues; }; static struct Qdisc * multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct multiq_sched_data *q = qdisc_priv(sch); u32 band; struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(q->filter_list); int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif band = skb_get_queue_mapping(skb); if (band >= q->bands) return q->queues[0]; return q->queues[band]; } static int multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct Qdisc *qdisc; int ret; qdisc = multiq_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *multiq_dequeue(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ q->curband++; if (q->curband >= q->bands) q->curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line 
blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), q->curband))) { qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } } } return NULL; } static struct sk_buff *multiq_peek(struct Qdisc *sch) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned int curband = q->curband; struct Qdisc *qdisc; struct sk_buff *skb; int band; for (band = 0; band < q->bands; band++) { /* cycle through bands to ensure fairness */ curband++; if (curband >= q->bands) curband = 0; /* Check that target subqueue is available before * pulling an skb to avoid head-of-line blocking. */ if (!netif_xmit_stopped( netdev_get_tx_queue(qdisc_dev(sch), curband))) { qdisc = q->queues[curband]; skb = qdisc->ops->peek(qdisc); if (skb) return skb; } } return NULL; } static void multiq_reset(struct Qdisc *sch) { u16 band; struct multiq_sched_data *q = qdisc_priv(sch); for (band = 0; band < q->bands; band++) qdisc_reset(q->queues[band]); q->curband = 0; } static void multiq_destroy(struct Qdisc *sch) { int band; struct multiq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); for (band = 0; band < q->bands; band++) qdisc_put(q->queues[band]); kfree(q->queues); } static int multiq_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); struct tc_multiq_qopt *qopt; struct Qdisc **removed; int i, n_removed = 0; if (!netif_is_multiqueue(qdisc_dev(sch))) return -EOPNOTSUPP; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); qopt->bands = qdisc_dev(sch)->real_num_tx_queues; removed = kmalloc(sizeof(*removed) * (q->max_bands - qopt->bands), GFP_KERNEL); if (!removed) return -ENOMEM; sch_tree_lock(sch); q->bands = qopt->bands; for (i = q->bands; i < q->max_bands; i++) { if (q->queues[i] != &noop_qdisc) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; qdisc_purge_queue(child); removed[n_removed++] = 
child; } } sch_tree_unlock(sch); for (i = 0; i < n_removed; i++) qdisc_put(removed[i]); kfree(removed); for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (child) { sch_tree_lock(sch); old = q->queues[i]; q->queues[i] = child; if (child != &noop_qdisc) qdisc_hash_add(child, true); if (old != &noop_qdisc) qdisc_purge_queue(old); sch_tree_unlock(sch); qdisc_put(old); } } } return 0; } static int multiq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); int i, err; q->queues = NULL; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->max_bands = qdisc_dev(sch)->num_tx_queues; q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL); if (!q->queues) return -ENOBUFS; for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; return multiq_tune(sch, opt, extack); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_multiq_qopt opt; opt.bands = q->bands; opt.max_bands = q->max_bands; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); return 0; } static struct Qdisc * multiq_leaf(struct Qdisc *sch, unsigned long arg) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long multiq_find(struct Qdisc *sch, u32 classid) { 
struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return multiq_find(sch, classid); } static void multiq_unbind(struct Qdisc *q, unsigned long cl) { } static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct multiq_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl - 1]->handle; return 0; } static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct multiq_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct multiq_sched_data *q = qdisc_priv(sch); int band; if (arg->stop) return; for (band = 0; band < q->bands; band++) { if (!tc_qdisc_stats_dump(sch, band + 1, arg)) break; } } static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct multiq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops multiq_class_ops = { .graft = multiq_graft, .leaf = multiq_leaf, .find = multiq_find, .walk = multiq_walk, .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, .unbind_tcf = multiq_unbind, .dump = multiq_dump_class, .dump_stats = multiq_dump_class_stats, }; static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &multiq_class_ops, .id = "multiq", .priv_size = sizeof(struct multiq_sched_data), .enqueue = multiq_enqueue, .dequeue = multiq_dequeue, .peek = multiq_peek, .init = multiq_init, .reset = multiq_reset, .destroy = multiq_destroy, .change = multiq_tune, .dump = multiq_dump, .owner = 
THIS_MODULE, }; MODULE_ALIAS_NET_SCH("multiq"); static int __init multiq_module_init(void) { return register_qdisc(&multiq_qdisc_ops); } static void __exit multiq_module_exit(void) { unregister_qdisc(&multiq_qdisc_ops); } module_init(multiq_module_init) module_exit(multiq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Multi queue to hardware queue mapping qdisc");
690 687 690 301 665 683 689 689 690 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 // SPDX-License-Identifier: GPL-2.0 /* * jump label x86 support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * */ #include <linux/jump_label.h> #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/jhash.h> #include <linux/cpu.h> #include <asm/kprobes.h> #include <asm/alternative.h> #include <asm/text-patching.h> #include <asm/insn.h> int arch_jump_entry_size(struct jump_entry *entry) { struct insn insn = {}; insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); BUG_ON(insn.length != 2 && insn.length != 5); return insn.length; } struct jump_label_patch { const void *code; int size; }; static struct jump_label_patch __jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { const void *expect, *code, *nop; const void *addr, *dest; int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); size = arch_jump_entry_size(entry); switch (size) { case JMP8_INSN_SIZE: code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; case JMP32_INSN_SIZE: code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); nop = x86_nops[size]; break; default: BUG(); } if (type == JUMP_LABEL_JMP) expect = nop; else expect = code; if (memcmp(addr, expect, size)) { /* * The location is not an op that we were expecting. * Something went wrong. Crash the box, as something could be * corrupting the kernel. 
*/ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", addr, addr, addr, expect, size, type); BUG(); } if (type == JUMP_LABEL_NOP) code = nop; return (struct jump_label_patch){.code = code, .size = size}; } static __always_inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that * system_state is SYSTEM_BOOTING guarantees it. It will be set to * SYSTEM_SCHEDULING before other cores are awaken and before the * code is write-protected. * * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { mutex_lock(&text_mutex); __jump_label_transform(entry, type, init); mutex_unlock(&text_mutex); } void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { jump_label_transform(entry, type, 0); } bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* * Fallback to the non-batching mode. 
*/ arch_jump_label_transform(entry, type); return true; } mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); text_poke_finish(); mutex_unlock(&text_mutex); }
23 3 20 20 4 9 5 24 24 24 4 4 5 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 // SPDX-License-Identifier: GPL-2.0-only /* * * Generic part shared by ipv4 and ipv6 backends. 
*/ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <linux/in.h> #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; struct nft_xfrm { enum nft_xfrm_keys key:8; u8 dreg; u8 dir; u8 spnum; u8 len; }; static int nft_xfrm_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int len = 0; u32 spnum = 0; u8 dir; if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY])); switch (priv->key) { case NFT_XFRM_KEY_REQID: case NFT_XFRM_KEY_SPI: len = sizeof(u32); break; case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: len = sizeof(struct in_addr); break; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: len = sizeof(struct in6_addr); break; default: return -EINVAL; } dir = nla_get_u8(tb[NFTA_XFRM_DIR]); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_OUT: priv->dir = dir; break; default: return -EINVAL; } if (tb[NFTA_XFRM_SPNUM]) spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); if (spnum >= XFRM_MAX_DEPTH) return -ERANGE; priv->spnum = spnum; priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current * state does have a valid address (BEET, TUNNEL). 
*/ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) { switch (k) { case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: if (family == NFPROTO_IPV4) break; return false; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: if (family == NFPROTO_IPV6) break; return false; default: return true; } return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, struct nft_regs *regs, const struct xfrm_state *state) { u32 *dest = &regs->data[priv->dreg]; if (!xfrm_state_addr_ok(priv->key, state->props.family, state->props.mode)) { regs->verdict.code = NFT_BREAK; return; } switch (priv->key) { case NFT_XFRM_KEY_UNSPEC: case __NFT_XFRM_KEY_MAX: WARN_ON_ONCE(1); break; case NFT_XFRM_KEY_DADDR_IP4: *dest = (__force __u32)state->id.daddr.a4; return; case NFT_XFRM_KEY_DADDR_IP6: memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_SADDR_IP4: *dest = (__force __u32)state->props.saddr.a4; return; case NFT_XFRM_KEY_SADDR_IP6: memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_REQID: *dest = state->props.reqid; return; case NFT_XFRM_KEY_SPI: *dest = (__force __u32)state->id.spi; return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { regs->verdict.code = NFT_BREAK; return; } state = sp->xvec[priv->spnum]; nft_xfrm_state_get_key(priv, regs, state); } static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct dst_entry *dst = skb_dst(pkt->skb); int i; for (i = 0; dst && dst->xfrm; dst = ((const struct xfrm_dst *)dst)->child, i++) { if (i < priv->spnum) continue; nft_xfrm_state_get_key(priv, 
regs, dst->xfrm); return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_xfrm *priv = nft_expr_priv(expr); switch (priv->dir) { case XFRM_POLICY_IN: nft_xfrm_get_eval_in(priv, regs, pkt); break; case XFRM_POLICY_OUT: nft_xfrm_get_eval_out(priv, regs, pkt); break; default: WARN_ON_ONCE(1); regs->verdict.code = NFT_BREAK; break; } } static int nft_xfrm_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) return -1; if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) return -1; if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) return -1; if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) return -1; return 0; } static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING); break; case XFRM_POLICY_OUT: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING); break; default: WARN_ON_ONCE(1); return -EINVAL; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_xfrm_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); const struct nft_xfrm *xfrm; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } xfrm = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != xfrm->key || priv->dreg != xfrm->dreg || priv->dir != xfrm->dir || priv->spnum != xfrm->spnum) { nft_reg_track_update(track, expr, 
priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { .type = &nft_xfrm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), .eval = nft_xfrm_get_eval, .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { .name = "xfrm", .ops = &nft_xfrm_get_ops, .policy = nft_xfrm_policy, .maxattr = NFTA_XFRM_MAX, .owner = THIS_MODULE, }; static int __init nft_xfrm_module_init(void) { return nft_register_expr(&nft_xfrm_type); } static void __exit nft_xfrm_module_exit(void) { nft_unregister_expr(&nft_xfrm_type); } module_init(nft_xfrm_module_init); module_exit(nft_xfrm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>"); MODULE_ALIAS_NFT_EXPR("xfrm");
1 3 3 1 2 1 1 1 1 1 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2017 Nicira, Inc. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/openvswitch.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/genetlink.h> #include "datapath.h" #include "meter.h" static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, }; static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; static u32 meter_hash(struct dp_meter_instance *ti, u32 id) { return id % ti->n_meters; } static void ovs_meter_free(struct dp_meter *meter) { if (!meter) return; kfree_rcu(meter, rcu); } /* Call with ovs_mutex or RCU read lock. 
*/
/* Return the meter with id 'meter_id' from 'tbl', or NULL when the slot the
 * id hashes to is empty or holds a meter with a different id.
 */
static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl,
				     u32 meter_id)
{
	struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti);
	u32 hash = meter_hash(ti, meter_id);
	struct dp_meter *meter;

	meter = rcu_dereference_ovsl(ti->dp_meters[hash]);
	/* The slot is selected by hash only, so the id still has to match. */
	if (meter && likely(meter->id == meter_id))
		return meter;

	return NULL;
}

/* Allocate a zeroed table instance with 'size' meter slots.
 * Returns NULL if the allocation fails.
 */
static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size)
{
	struct dp_meter_instance *ti;

	ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL);
	if (!ti)
		return NULL;

	ti->n_meters = size;

	return ti;
}

/* Free a table instance immediately (no RCU deferral). */
static void dp_meter_instance_free(struct dp_meter_instance *ti)
{
	kvfree(ti);
}

/* RCU callback: recover the table instance from its rcu head and free it. */
static void dp_meter_instance_free_rcu(struct rcu_head *rcu)
{
	struct dp_meter_instance *ti;

	ti = container_of(rcu, struct dp_meter_instance, rcu);
	kvfree(ti);
}

/* Replace the table's instance with a new one of 'size' slots.
 *
 * The first min(size, old n_meters) slots are copied index-for-index, the
 * new instance is published with rcu_assign_pointer() and the old one is
 * freed through call_rcu().
 *
 * Returns 0 on success or -ENOMEM if the new instance cannot be allocated.
 */
static int
dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size)
{
	struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti);
	int n_meters = min(size, ti->n_meters);
	struct dp_meter_instance *new_ti;
	int i;

	new_ti = dp_meter_instance_alloc(size);
	if (!new_ti)
		return -ENOMEM;

	/* Slots keep their index; entries are not re-hashed. */
	for (i = 0; i < n_meters; i++)
		if (rcu_dereference_ovsl(ti->dp_meters[i]))
			new_ti->dp_meters[i] = ti->dp_meters[i];

	rcu_assign_pointer(tbl->ti, new_ti);
	call_rcu(&ti->rcu, dp_meter_instance_free_rcu);

	return 0;
}

/* Publish 'meter' in the slot of 'ti' that its id hashes to. */
static void dp_meter_instance_insert(struct dp_meter_instance *ti,
				     struct dp_meter *meter)
{
	u32 hash;

	hash = meter_hash(ti, meter->id);
	rcu_assign_pointer(ti->dp_meters[hash], meter);
}

/* Clear the slot of 'ti' that the id of 'meter' hashes to. */
static void dp_meter_instance_remove(struct dp_meter_instance *ti,
				     struct dp_meter *meter)
{
	u32 hash;

	hash = meter_hash(ti, meter->id);
	RCU_INIT_POINTER(ti->dp_meters[hash], NULL);
}

/* Insert 'meter' into 'tbl', growing the slot array when it fills up.
 *
 * Returns 0 on success, -EBUSY if the target slot is already occupied,
 * -EFBIG if the table count reaches max_meters_allowed, or -ENOMEM if the
 * array cannot be doubled. On -EFBIG/-ENOMEM the insertion is rolled back.
 */
static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter)
{
	struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti);
	u32 hash = meter_hash(ti, meter->id);
	int err;

	/* In general, the selected slot should be empty, because
	 * OvS uses an id-pool to fetch an available id.
	 */
	if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash])))
		return -EBUSY;

	dp_meter_instance_insert(ti, meter);

	/* That function is thread-safe. */
	tbl->count++;
	if (tbl->count >= tbl->max_meters_allowed) {
		err = -EFBIG;
		goto attach_err;
	}

	/* Double the slot array once every slot is in use. */
	if (tbl->count >= ti->n_meters &&
	    dp_meter_instance_realloc(tbl, ti->n_meters * 2)) {
		err = -ENOMEM;
		goto attach_err;
	}

	return 0;

attach_err:
	/* Undo the insert and the count bump done above. */
	dp_meter_instance_remove(ti, meter);
	tbl->count--;
	return err;
}

/* Remove 'meter' from 'tbl' and halve the slot array when it has become
 * sparse. A NULL 'meter' is a no-op.
 *
 * Returns 0 on success or -ENOMEM if shrinking fails, in which case the
 * meter is put back into the table. The meter itself is not freed here.
 */
static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter)
{
	struct dp_meter_instance *ti;

	ASSERT_OVSL();
	if (!meter)
		return 0;

	ti = rcu_dereference_ovsl(tbl->ti);
	dp_meter_instance_remove(ti, meter);

	tbl->count--;

	/* Shrink the meter array if necessary. */
	if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN &&
	    tbl->count <= (ti->n_meters / 4)) {
		int half_size = ti->n_meters / 2;
		int i;

		/* Avoid hash collisions: do not move slots to another place.
		 * Make sure no meters are referenced from the part of the
		 * array that will be released.
		 */
		for (i = half_size; i < ti->n_meters; i++)
			if (rcu_dereference_ovsl(ti->dp_meters[i]))
				goto out;

		if (dp_meter_instance_realloc(tbl, half_size))
			goto shrink_err;
	}

out:
	return 0;

shrink_err:
	/* Restore the table to the state it had on entry. */
	dp_meter_instance_insert(ti, meter);
	tbl->count++;
	return -ENOMEM;
}

/* Allocate a reply skb and start a genetlink message for 'cmd' in it.
 *
 * The new message header is stored through 'ovs_reply_header' and gets the
 * dp_ifindex of the request. Returns the skb, ERR_PTR(-ENOMEM) if the skb
 * cannot be allocated, or ERR_PTR(-EMSGSIZE) if the header does not fit.
 */
static struct sk_buff *
ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd,
			  struct ovs_header **ovs_reply_header)
{
	struct sk_buff *skb;
	struct ovs_header *ovs_header = genl_info_userhdr(info);

	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
	if (!skb)
		return ERR_PTR(-ENOMEM);

	*ovs_reply_header = genlmsg_put(skb, info->snd_portid,
					info->snd_seq,
					&dp_meter_genl_family, 0, cmd);
	if (!*ovs_reply_header) {
		nlmsg_free(skb);
		return ERR_PTR(-EMSGSIZE);
	}
	(*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex;

	return skb;
}

/* Append the id, stats and last-used time of 'meter', followed by one nested
 * per-band stats attribute for each band, to 'reply'.
 *
 * Returns 0 on success or -EMSGSIZE if any attribute does not fit. The
 * callers in this file hold meter->lock around the call.
 */
static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id,
				     struct dp_meter *meter)
{
	struct nlattr *nla;
	struct dp_meter_band *band;
	u16 i;

	if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id))
		goto error;

	if (nla_put(reply, OVS_METER_ATTR_STATS,
		    sizeof(struct ovs_flow_stats), &meter->stats))
		goto error;

	if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used,
			      OVS_METER_ATTR_PAD))
		goto error;

	nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS);
	if (!nla)
		goto error;

	band = meter->bands;

	for (i = 0; i < meter->n_bands; ++i, ++band) {
		struct nlattr *band_nla;

		band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC);
		if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS,
					 sizeof(struct ovs_flow_stats),
					 &band->stats))
			goto error;
		nla_nest_end(reply, band_nla);
	}
	nla_nest_end(reply, nla);

	return 0;
error:
	return -EMSGSIZE;
}

/* OVS_METER_CMD_FEATURES handler: report the datapath's meter limit, the
 * maximum number of bands and the supported band type.
 *
 * Returns the result of genlmsg_reply() on success, -ENODEV if the datapath
 * is not found, or -EMSGSIZE if the reply cannot be built.
 */
static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = genl_info_userhdr(info);
	struct ovs_header *ovs_reply_header;
	struct nlattr *nla, *band_nla;
	struct sk_buff *reply;
	struct datapath *dp;
	int err = -EMSGSIZE;	/* default for every nla_put failure below */

	reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES,
					  &ovs_reply_header);
	if (IS_ERR(reply))
		return PTR_ERR(reply);

	ovs_lock();
	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
	if (!dp) {
		err = -ENODEV;
		goto exit_unlock;
	}

	if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS,
			dp->meter_tbl.max_meters_allowed))
		goto exit_unlock;

	/* Nothing below reads 'dp', so the lock is dropped here. */
	ovs_unlock();

	if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS))
		goto nla_put_failure;

	nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS);
	if (!nla)
		goto nla_put_failure;

	band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC);
	if (!band_nla)
		goto nla_put_failure;
	/* Currently only DROP band type is supported. */
	if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP))
		goto nla_put_failure;
	nla_nest_end(reply, band_nla);
	nla_nest_end(reply, nla);

	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_unlock:
	ovs_unlock();
nla_put_failure:
	nlmsg_free(reply);
	return err;
}

/* Build a new meter from the parsed netlink attributes in 'a'.
 *
 * The caller must have checked that a[OVS_METER_ATTR_ID] is present; it is
 * read here without a NULL check.
 *
 * Returns the meter, or ERR_PTR(-EINVAL) when the bands attribute is missing,
 * holds more than DP_MAX_BANDS bands, or a band lacks type/rate/burst or has
 * a zero rate; ERR_PTR(-ENOMEM) on allocation failure; or the error returned
 * by nla_parse_deprecated() for a malformed band.
 */
static struct dp_meter *dp_meter_create(struct nlattr **a)
{
	struct nlattr *nla;
	int rem;
	u16 n_bands = 0;
	struct dp_meter *meter;
	struct dp_meter_band *band;
	int err;

	/* Validate attributes, count the bands. */
	if (!a[OVS_METER_ATTR_BANDS])
		return ERR_PTR(-EINVAL);

	nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem)
		if (++n_bands > DP_MAX_BANDS)
			return ERR_PTR(-EINVAL);

	/* Allocate and set up the meter before locking anything. */
	meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT);
	if (!meter)
		return ERR_PTR(-ENOMEM);

	meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]);
	meter->used = div_u64(ktime_get_ns(), 1000 * 1000);	/* ns -> ms */
	meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0;
	meter->keep_stats = !a[OVS_METER_ATTR_CLEAR];
	spin_lock_init(&meter->lock);
	if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) {
		meter->stats = *(struct ovs_flow_stats *)
			nla_data(a[OVS_METER_ATTR_STATS]);
	}
	meter->n_bands = n_bands;

	/* Set up meter bands. */
	band = meter->bands;
	nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) {
		struct nlattr *attr[OVS_BAND_ATTR_MAX + 1];
		u32 band_max_delta_t;

		err = nla_parse_deprecated((struct nlattr **)&attr,
					   OVS_BAND_ATTR_MAX, nla_data(nla),
					   nla_len(nla), band_policy, NULL);
		if (err)
			goto exit_free_meter;

		if (!attr[OVS_BAND_ATTR_TYPE] ||
		    !attr[OVS_BAND_ATTR_RATE] ||
		    !attr[OVS_BAND_ATTR_BURST]) {
			err = -EINVAL;
			goto exit_free_meter;
		}

		band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]);
		band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]);
		/* The rate is used as a divisor below. */
		if (band->rate == 0) {
			err = -EINVAL;
			goto exit_free_meter;
		}

		band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]);
		/* Figure out max delta_t that is enough to fill any bucket.
		 * Keep max_delta_t size to the bucket units:
		 * pkts => 1/1000 packets, kilobits => bits.
		 *
		 * Start with a full bucket.
		 */
		band->bucket = band->burst_size * 1000ULL;
		band_max_delta_t = div_u64(band->bucket, band->rate);
		if (band_max_delta_t > meter->max_delta_t)
			meter->max_delta_t = band_max_delta_t;
		band++;
	}

	return meter;

exit_free_meter:
	kfree(meter);
	return ERR_PTR(err);
}

/* OVS_METER_CMD_SET handler: create a meter from the request and install it,
 * replacing any existing meter with the same id.
 *
 * The new meter and the reply skb are prepared before ovs_lock() is taken.
 * The reply carries the meter id and, when an old meter existed with
 * keep_stats set, that meter's statistics.
 *
 * Returns the result of genlmsg_reply() on success, -EINVAL if the id
 * attribute is missing, -ENODEV if the datapath is not found, or the error
 * from dp_meter_create(), ovs_meter_cmd_reply_start(), detach_meter() or
 * attach_meter().
 */
static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
{
	struct nlattr **a = info->attrs;
	struct dp_meter *meter, *old_meter;
	struct sk_buff *reply;
	struct ovs_header *ovs_reply_header;
	struct ovs_header *ovs_header = genl_info_userhdr(info);
	struct dp_meter_table *meter_tbl;
	struct datapath *dp;
	int err;
	u32 meter_id;
	bool failed;

	if (!a[OVS_METER_ATTR_ID])
		return -EINVAL;

	meter = dp_meter_create(a);
	if (IS_ERR(meter))
		return PTR_ERR(meter);

	reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET,
					  &ovs_reply_header);
	if (IS_ERR(reply)) {
		err = PTR_ERR(reply);
		goto exit_free_meter;
	}

	ovs_lock();
	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
	if (!dp) {
		err = -ENODEV;
		goto exit_unlock;
	}

	meter_tbl = &dp->meter_tbl;
	meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);

	/* Detach the old meter (if any) first so its slot is free. */
	old_meter = lookup_meter(meter_tbl, meter_id);
	err = detach_meter(meter_tbl, old_meter);
	if (err)
		goto exit_unlock;

	err = attach_meter(meter_tbl, meter);
	if (err)
		goto exit_free_old_meter;

	ovs_unlock();

	/* Build response with the meter_id and stats from
	 * the old meter, if any.
	 */
	failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id);
	WARN_ON(failed);
	if (old_meter) {
		spin_lock_bh(&old_meter->lock);
		if (old_meter->keep_stats) {
			err = ovs_meter_cmd_reply_stats(reply, meter_id,
							old_meter);
			WARN_ON(err);
		}
		spin_unlock_bh(&old_meter->lock);
		ovs_meter_free(old_meter);
	}

	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_free_old_meter:
	/* The old meter was already detached above, so it is released here. */
	ovs_meter_free(old_meter);
exit_unlock:
	ovs_unlock();
	nlmsg_free(reply);
exit_free_meter:
	kfree(meter);
	return err;
}

/* OVS_METER_CMD_GET handler: reply with the statistics of one meter.
 *
 * Returns the result of genlmsg_reply() on success, -EINVAL if the id
 * attribute is missing, -ENODEV if the datapath is not found, -ENOENT if no
 * meter has that id, or the error from building the reply.
 */
static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = genl_info_userhdr(info);
	struct ovs_header *ovs_reply_header;
	struct nlattr **a = info->attrs;
	struct dp_meter *meter;
	struct sk_buff *reply;
	struct datapath *dp;
	u32 meter_id;
	int err;

	if (!a[OVS_METER_ATTR_ID])
		return -EINVAL;

	meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);

	reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET,
					  &ovs_reply_header);
	if (IS_ERR(reply))
		return PTR_ERR(reply);

	ovs_lock();

	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
	if (!dp) {
		err = -ENODEV;
		goto exit_unlock;
	}

	/* Locate meter, copy stats. */
	meter = lookup_meter(&dp->meter_tbl, meter_id);
	if (!meter) {
		err = -ENOENT;
		goto exit_unlock;
	}

	spin_lock_bh(&meter->lock);
	err = ovs_meter_cmd_reply_stats(reply, meter_id, meter);
	spin_unlock_bh(&meter->lock);
	if (err)
		goto exit_unlock;

	ovs_unlock();

	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_unlock:
	ovs_unlock();
	nlmsg_free(reply);
	return err;
}

/* OVS_METER_CMD_DEL handler: remove a meter and reply with its final stats.
 *
 * Deleting an id that has no meter is not an error; the reply then carries
 * no statistics.
 *
 * Returns the result of genlmsg_reply() on success, -EINVAL if the id
 * attribute is missing, -ENODEV if the datapath is not found, or the error
 * from detach_meter().
 */
static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
	struct ovs_header *ovs_header = genl_info_userhdr(info);
	struct ovs_header *ovs_reply_header;
	struct nlattr **a = info->attrs;
	struct dp_meter *old_meter;
	struct sk_buff *reply;
	struct datapath *dp;
	u32 meter_id;
	int err;

	if (!a[OVS_METER_ATTR_ID])
		return -EINVAL;

	reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL,
					  &ovs_reply_header);
	if (IS_ERR(reply))
		return PTR_ERR(reply);

	ovs_lock();

	dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
	if (!dp) {
		err = -ENODEV;
		goto exit_unlock;
	}

	meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
	old_meter = lookup_meter(&dp->meter_tbl, meter_id);
	if (old_meter) {
		/* Snapshot the stats into the reply before detaching. */
		spin_lock_bh(&old_meter->lock);
		err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter);
		WARN_ON(err);
		spin_unlock_bh(&old_meter->lock);

		err = detach_meter(&dp->meter_tbl, old_meter);
		if (err)
			goto exit_unlock;
	}

	ovs_unlock();
	/* ovs_meter_free() accepts NULL, so no check is needed here. */
	ovs_meter_free(old_meter);
	genlmsg_end(reply, ovs_reply_header);
	return genlmsg_reply(reply, info);

exit_unlock:
	ovs_unlock();
	nlmsg_free(reply);
	return err;
}

/* Meter action execution.
 *
 * Return true if the 'meter_id' drop band is triggered. The 'skb' should be
 * dropped by the caller.
*/ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; struct dp_meter_band *band; struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; u32 delta_ms; u32 cost; meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; /* Lock the meter while using it. */ spin_lock(&meter->lock); long_delta_ms = (now_ms - meter->used); /* ms */ if (long_delta_ms < 0) { /* This condition means that we have several threads fighting * for a meter lock, and the one who received the packets a * bit later wins. Assuming that all racing threads received * packets at the same time to avoid overflow. */ long_delta_ms = 0; } /* Make sure delta_ms will not be too large, so that bucket will not * wrap around below. */ delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) ? meter->max_delta_t : (u32)long_delta_ms; /* Update meter statistics. */ meter->used = now_ms; meter->stats.n_packets += 1; meter->stats.n_bytes += skb->len; /* Bucket rate is either in kilobits per second, or in packets per * second. We maintain the bucket in the units of either bits or * 1/1000th of a packet, correspondingly. * Then, when rate is multiplied with milliseconds, we get the * bucket units: * msec * kbps = bits, and * msec * packets/sec = 1/1000 packets. * * 'cost' is the number of bucket units in this packet. */ cost = (meter->kbps) ? skb->len * 8 : 1000; /* Update all bands and find the one hit with the highest rate. 
*/ for (i = 0; i < meter->n_bands; ++i) { long long int max_bucket_size; band = &meter->bands[i]; max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) band->bucket = max_bucket_size; if (band->bucket >= cost) { band->bucket -= cost; } else if (band->rate > band_exceeded_rate) { band_exceeded_rate = band->rate; band_exceeded_max = i; } } if (band_exceeded_max >= 0) { /* Update band statistics. */ band = &meter->bands[band_exceeded_max]; band->stats.n_packets += 1; band->stats.n_bytes += skb->len; /* Drop band triggered, let the caller drop the 'skb'. */ if (band->type == OVS_METER_BAND_TYPE_DROP) { spin_unlock(&meter->lock); return true; } } spin_unlock(&meter->lock); return false; } static const struct genl_small_ops dp_meter_genl_ops[] = { { .cmd = OVS_METER_CMD_FEATURES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_features }, { .cmd = OVS_METER_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_get, }, { .cmd = OVS_METER_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. 
*/ .doit = ovs_meter_cmd_del }, }; static const struct genl_multicast_group ovs_meter_multicast_group = { .name = OVS_METER_MCGROUP, }; struct genl_family dp_meter_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_METER_FAMILY, .version = OVS_METER_VERSION, .maxattr = OVS_METER_ATTR_MAX, .policy = meter_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_meter_genl_ops, .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops), .resv_start_op = OVS_METER_CMD_GET + 1, .mcgrps = &ovs_meter_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; /* Allow meters in a datapath to use ~3.12% of physical memory. */ free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), DP_METER_NUM_MAX); if (!tbl->max_meters_allowed) goto out_err; rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; out_err: dp_meter_instance_free(ti); return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; for (i = 0; i < ti->n_meters; i++) ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); }
5 12 11 1 2 1 1 1 1 5 5 6 5 12 1 11 11 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
// SPDX-License-Identifier: GPL-2.0
/*
 * Key setup for v1 encryption policies
 *
 * Copyright 2015, 2019 Google LLC
 */

/*
 * This file implements compatibility functions for the original encryption
 * policy version ("v1"), including:
 *
 * - Deriving per-file encryption keys using the AES-128-ECB based KDF
 *   (rather than the new method of using HKDF-SHA512)
 *
 * - Retrieving fscrypt master keys from process-subscribed keyrings
 *   (rather than the new method of using a filesystem-level keyring)
 *
 * - Handling policies with the DIRECT_KEY flag set using a master key table
 *   (rather than the new method of implementing DIRECT_KEY with per-mode keys
 *    managed alongside the master keys in the filesystem-level keyring)
 */

#include <crypto/skcipher.h>
#include <crypto/utils.h>
#include <keys/user-type.h>
#include <linux/hashtable.h>
#include <linux/scatterlist.h>

#include "fscrypt_private.h"

/* Table of keys referenced by DIRECT_KEY policies */
static DEFINE_HASHTABLE(fscrypt_direct_keys, 6); /* 6 bits = 64 buckets */
static DEFINE_SPINLOCK(fscrypt_direct_keys_lock);

/*
 * v1 key derivation function.  This generates the derived key by encrypting the
 * master key with AES-128-ECB using the nonce as the AES key.  This provides a
 * unique derived key with sufficient entropy for each inode.  However, it's
 * nonstandard, non-extensible, doesn't evenly distribute the entropy from the
 * master key, and is trivially reversible: an attacker who compromises a
 * derived key can "decrypt" it to get back to the master key, then derive any
 * other key.  For all new code, use HKDF instead.
 *
 * The master key must be at least as long as the derived key.  If the master
 * key is longer, then only the first 'derived_keysize' bytes are used.
 *
 * Return: 0 on success, or a negative errno from transform allocation,
 * request allocation (-ENOMEM), key setup or the encryption itself.
 */
static int derive_key_aes(const u8 *master_key,
			  const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
			  u8 *derived_key, unsigned int derived_keysize)
{
	int res = 0;
	struct skcipher_request *req = NULL;
	DECLARE_CRYPTO_WAIT(wait);
	struct scatterlist src_sg, dst_sg;
	struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);

	if (IS_ERR(tfm)) {
		res = PTR_ERR(tfm);
		/* Clear it so the common cleanup below is not handed an ERR_PTR. */
		tfm = NULL;
		goto out;
	}
	crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		res = -ENOMEM;
		goto out;
	}
	skcipher_request_set_callback(req,
			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
			crypto_req_done, &wait);
	/* The per-file nonce is the AES key; the master key is the plaintext. */
	res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
	if (res < 0)
		goto out;

	sg_init_one(&src_sg, master_key, derived_keysize);
	sg_init_one(&dst_sg, derived_key, derived_keysize);
	skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
				   NULL);
	res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
	skcipher_request_free(req);
	crypto_free_skcipher(tfm);
	return res;
}

/*
 * Search the current task's subscribed keyrings for a "logon" key with
 * description prefix:descriptor, and if found acquire a read lock on it and
 * return a pointer to its validated payload in *payload_ret.
 *
 * Return: the key with key->sem held for reading; ERR_PTR(-ENOMEM) if the
 * description cannot be allocated; the error from request_key(); or
 * ERR_PTR(-ENOKEY) if the payload is missing, malformed or shorter than
 * 'min_keysize'.
 */
static struct key *
find_and_lock_process_key(const char *prefix,
			  const u8 descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE],
			  unsigned int min_keysize,
			  const struct fscrypt_key **payload_ret)
{
	char *description;
	struct key *key;
	const struct user_key_payload *ukp;
	const struct fscrypt_key *payload;

	description = kasprintf(GFP_KERNEL, "%s%*phN", prefix,
				FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
	if (!description)
		return ERR_PTR(-ENOMEM);

	key = request_key(&key_type_logon, description, NULL);
	kfree(description);
	if (IS_ERR(key))
		return key;

	down_read(&key->sem);
	ukp = user_key_payload_locked(key);

	if (!ukp) /* was the key revoked before we acquired its semaphore? */
		goto invalid;

	payload = (const struct fscrypt_key *)ukp->data;

	if (ukp->datalen != sizeof(struct fscrypt_key) ||
	    payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
		fscrypt_warn(NULL,
			     "key with description '%s' has invalid payload",
			     key->description);
		goto invalid;
	}

	if (payload->size < min_keysize) {
		fscrypt_warn(NULL,
			     "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
			     key->description, payload->size, min_keysize);
		goto invalid;
	}

	*payload_ret = payload;
	return key;

invalid:
	/* Drop both the read lock and the reference taken by request_key(). */
	up_read(&key->sem);
	key_put(key);
	return ERR_PTR(-ENOKEY);
}

/* Master key referenced by DIRECT_KEY policy */
struct fscrypt_direct_key {
	struct super_block		*dk_sb;
	struct hlist_node		dk_node;	/* link in fscrypt_direct_keys */
	refcount_t			dk_refcount;
	const struct fscrypt_mode	*dk_mode;
	struct fscrypt_prepared_key	dk_key;
	u8				dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
	u8				dk_raw[FSCRYPT_MAX_KEY_SIZE];
};

/* Destroy the prepared key of 'dk' and free it; a NULL 'dk' is ignored. */
static void free_direct_key(struct fscrypt_direct_key *dk)
{
	if (dk) {
		fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key);
		/* The struct embeds the raw key, hence kfree_sensitive(). */
		kfree_sensitive(dk);
	}
}

/*
 * Drop one reference to 'dk'.  When the refcount reaches zero, the key is
 * removed from the fscrypt_direct_keys table under fscrypt_direct_keys_lock
 * and then freed.
 */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk)
{
	if (!refcount_dec_and_lock(&dk->dk_refcount, &fscrypt_direct_keys_lock))
		return;
	hash_del(&dk->dk_node);
	spin_unlock(&fscrypt_direct_keys_lock);

	free_direct_key(dk);
}

/*
 * Find/insert the given key into the fscrypt_direct_keys table.  If found, it
 * is returned with elevated refcount, and 'to_insert' is freed if non-NULL.  If
 * not found, 'to_insert' is inserted and returned if it's non-NULL; otherwise
 * NULL is returned.
 */
static struct fscrypt_direct_key *
find_or_insert_direct_key(struct fscrypt_direct_key *to_insert,
			  const u8 *raw_key,
			  const struct fscrypt_inode_info *ci)
{
	unsigned long hash_key;
	struct fscrypt_direct_key *dk;

	/*
	 * Careful: to avoid potentially leaking secret key bytes via timing
	 * information, we must key the hash table by descriptor rather than by
	 * raw key, and use crypto_memneq() when comparing raw keys.
	 */

	BUILD_BUG_ON(sizeof(hash_key) > FSCRYPT_KEY_DESCRIPTOR_SIZE);
	memcpy(&hash_key, ci->ci_policy.v1.master_key_descriptor,
	       sizeof(hash_key));

	spin_lock(&fscrypt_direct_keys_lock);
	hash_for_each_possible(fscrypt_direct_keys, dk, dk_node, hash_key) {
		if (memcmp(ci->ci_policy.v1.master_key_descriptor,
			   dk->dk_descriptor, FSCRYPT_KEY_DESCRIPTOR_SIZE) != 0)
			continue;
		if (ci->ci_mode != dk->dk_mode)
			continue;
		if (!fscrypt_is_key_prepared(&dk->dk_key, ci))
			continue;
		if (crypto_memneq(raw_key, dk->dk_raw, ci->ci_mode->keysize))
			continue;
		/* using existing tfm with same (descriptor, mode, raw_key) */
		refcount_inc(&dk->dk_refcount);
		spin_unlock(&fscrypt_direct_keys_lock);
		/* The duplicate is freed only after the lock is released. */
		free_direct_key(to_insert);
		return dk;
	}
	if (to_insert)
		hash_add(fscrypt_direct_keys, &to_insert->dk_node, hash_key);
	spin_unlock(&fscrypt_direct_keys_lock);
	return to_insert;
}

/*
 * Prepare to encrypt directly using the master key in the given mode.
 *
 * Return: a referenced fscrypt_direct_key (either an existing table entry or
 * a newly inserted one), ERR_PTR(-ENOMEM) on allocation failure, or the
 * ERR_PTR-encoded error from fscrypt_prepare_key().
 */
static struct fscrypt_direct_key *
fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key)
{
	struct fscrypt_direct_key *dk;
	int err;

	/* Is there already a tfm for this key? */
	dk = find_or_insert_direct_key(NULL, raw_key, ci);
	if (dk)
		return dk;

	/* Nope, allocate one. */
	dk = kzalloc(sizeof(*dk), GFP_KERNEL);
	if (!dk)
		return ERR_PTR(-ENOMEM);
	dk->dk_sb = ci->ci_inode->i_sb;
	refcount_set(&dk->dk_refcount, 1);
	dk->dk_mode = ci->ci_mode;
	err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci);
	if (err)
		goto err_free_dk;
	memcpy(dk->dk_descriptor, ci->ci_policy.v1.master_key_descriptor,
	       FSCRYPT_KEY_DESCRIPTOR_SIZE);
	memcpy(dk->dk_raw, raw_key, ci->ci_mode->keysize);

	/*
	 * Look up again while inserting: the lock was not held during the
	 * allocation above, so an equivalent entry may have been added in the
	 * meantime, in which case that entry is returned and 'dk' is freed.
	 */
	return find_or_insert_direct_key(dk, raw_key, ci);

err_free_dk:
	free_direct_key(dk);
	return ERR_PTR(err);
}

/* v1 policy, DIRECT_KEY: use the master key directly */
static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci,
				    const u8 *raw_master_key)
{
	struct fscrypt_direct_key *dk;

	dk = fscrypt_get_direct_key(ci, raw_master_key);
	if (IS_ERR(dk))
		return PTR_ERR(dk);
	/* Record the reference and copy the prepared key struct by value. */
	ci->ci_direct_key = dk;
	ci->ci_enc_key = dk->dk_key;
	return 0;
}

/* v1 policy, !DIRECT_KEY: derive the file's encryption key */
static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci,
				     const u8 *raw_master_key)
{
	u8 *derived_key;
	int err;

	/*
	 * This cannot be a stack buffer because it will be passed to the
	 * scatterlist crypto API during derive_key_aes().
	 */
	derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL);
	if (!derived_key)
		return -ENOMEM;

	err = derive_key_aes(raw_master_key, ci->ci_nonce,
			     derived_key, ci->ci_mode->keysize);
	if (err)
		goto out;

	err = fscrypt_set_per_file_enc_key(ci, derived_key);
out:
	/* The temporary key buffer is released on success and failure alike. */
	kfree_sensitive(derived_key);
	return err;
}

/*
 * Set up the encryption key of a file that uses a v1 policy, given the raw
 * master key.  Policies with FSCRYPT_POLICY_FLAG_DIRECT_KEY use the master key
 * directly; all others get a key derived from the master key and the file's
 * nonce.
 *
 * Return: 0 on success or a negative errno from the chosen setup path.
 */
int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
			      const u8 *raw_master_key)
{
	if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY)
		return setup_v1_file_key_direct(ci, raw_master_key);
	else
		return setup_v1_file_key_derived(ci, raw_master_key);
}

/*
 * Set up the encryption key of a v1-policy file using a master key found in
 * the calling process's subscribed keyrings.
 *
 * The key is looked up under FSCRYPT_KEY_DESC_PREFIX first; if that yields
 * -ENOKEY and the filesystem provides a legacy_key_prefix, the lookup is
 * retried with that prefix.
 *
 * Return: 0 on success, the lookup error if no usable key is found, or the
 * error from fscrypt_setup_v1_file_key().
 */
int
fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci)
{
	const struct super_block *sb = ci->ci_inode->i_sb;
	struct key *key;
	const struct fscrypt_key *payload;
	int err;

	key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX,
					ci->ci_policy.v1.master_key_descriptor,
					ci->ci_mode->keysize, &payload);
	if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) {
		key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix,
						ci->ci_policy.v1.master_key_descriptor,
						ci->ci_mode->keysize, &payload);
	}
	if (IS_ERR(key))
		return PTR_ERR(key);

	err = fscrypt_setup_v1_file_key(ci, payload->raw);
	/* 'payload' points into the key, so unlock only after it was used. */
	up_read(&key->sem);
	key_put(key);
	return err;
}
57 2 1 2 3 1 6 1 52 2 50 15 43 31 20 20 7 1 4 4 2 3 2 16 10 16 4 8 1 1 4 2 2 20 3 1 1 2 3 4 9 72 64 23 72 60 50 60 1 96 96 11 1 28 19 26 73 36 1 70 71 48 146 142 135 135 1 86 5 5 3 5 5 5 122 14 73 39 14 107 58 54 4 86 86 44 85 46 40 73 13 86 185 11 174 1 185 1 186 5 2 3 4 11 10 12 5 1 10 5 5 9 2 2 7 3 5 5 4 5 4 15 15 12 2 6 4 3 9 233 233 232 18 231 2 231 233 203 41 101 3 231 233 233 233 233 232 11 254 253 253 10 10 6 8 7 7 7 11 11 7 7 7 7 7 7 4 7 7 7 7 1 7 7 7 7 7 1 7 4 7 1 2 4 4 4 4 4 4 3 2 248 248 248 248 248 247 248 248 248 248 248 246 248 245 212 230 248 248 248 247 247 248 42 194 37 193 124 180 231 235 216 244 245 246 338 339 33 19 19 39 39 109 22 54 33 49 3 48 27 27 11 9 8 2 11 35 199 117 120 181 56 234 234 233 232 233 234 233 233 185 186 186 186 186 16 234 233 82 153 233 234 218 16 234 234 6 228 63 172 64 171 233 2 233 233 217 16 170 170 100 96 100 35 35 203 232 233 233 230 231 232 92 185 185 186 185 232 186 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 
295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 
795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 
1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 
1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 
2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 
2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 // SPDX-License-Identifier: GPL-2.0-only /* * The input core * * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_BASENAME ": " fmt #include <linux/init.h> #include <linux/types.h> #include <linux/idr.h> #include 
<linux/input/mt.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/major.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/pm.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/kstrtox.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include "input-compat.h" #include "input-core-private.h" #include "input-poller.h" MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION("Input core"); MODULE_LICENSE("GPL"); #define INPUT_MAX_CHAR_DEVICES 1024 #define INPUT_FIRST_DYNAMIC_DEV 256 static DEFINE_IDA(input_ida); static LIST_HEAD(input_dev_list); static LIST_HEAD(input_handler_list); /* * input_mutex protects access to both input_dev_list and input_handler_list. * This also causes input_[un]register_device and input_[un]register_handler * be mutually exclusive which simplifies locking in drivers implementing * input handlers. */ static DEFINE_MUTEX(input_mutex); static const struct input_value input_value_sync = { EV_SYN, SYN_REPORT, 1 }; static const unsigned int input_max_code[EV_CNT] = { [EV_KEY] = KEY_MAX, [EV_REL] = REL_MAX, [EV_ABS] = ABS_MAX, [EV_MSC] = MSC_MAX, [EV_SW] = SW_MAX, [EV_LED] = LED_MAX, [EV_SND] = SND_MAX, [EV_FF] = FF_MAX, }; static inline int is_event_supported(unsigned int code, unsigned long *bm, unsigned int max) { return code <= max && test_bit(code, bm); } static int input_defuzz_abs_event(int value, int old_val, int fuzz) { if (fuzz) { if (value > old_val - fuzz / 2 && value < old_val + fuzz / 2) return old_val; if (value > old_val - fuzz && value < old_val + fuzz) return (old_val * 3 + value) / 4; if (value > old_val - fuzz * 2 && value < old_val + fuzz * 2) return (old_val + value) / 2; } return value; } static void input_start_autorepeat(struct input_dev *dev, int code) { if (test_bit(EV_REP, dev->evbit) && dev->rep[REP_PERIOD] && dev->rep[REP_DELAY] && dev->timer.function) { dev->repeat_key = code; 
mod_timer(&dev->timer, jiffies + msecs_to_jiffies(dev->rep[REP_DELAY])); } } static void input_stop_autorepeat(struct input_dev *dev) { del_timer(&dev->timer); } /* * Pass values first through all filters and then, if event has not been * filtered out, through all open handles. This order is achieved by placing * filters at the head of the list of handles attached to the device, and * placing regular handles at the tail of the list. * * This function is called with dev->event_lock held and interrupts disabled. */ static void input_pass_values(struct input_dev *dev, struct input_value *vals, unsigned int count) { struct input_handle *handle; struct input_value *v; lockdep_assert_held(&dev->event_lock); scoped_guard(rcu) { handle = rcu_dereference(dev->grab); if (handle) { count = handle->handle_events(handle, vals, count); break; } list_for_each_entry_rcu(handle, &dev->h_list, d_node) { if (handle->open) { count = handle->handle_events(handle, vals, count); if (!count) break; } } } /* trigger auto repeat for key events */ if (test_bit(EV_REP, dev->evbit) && test_bit(EV_KEY, dev->evbit)) { for (v = vals; v != vals + count; v++) { if (v->type == EV_KEY && v->value != 2) { if (v->value) input_start_autorepeat(dev, v->code); else input_stop_autorepeat(dev); } } } } #define INPUT_IGNORE_EVENT 0 #define INPUT_PASS_TO_HANDLERS 1 #define INPUT_PASS_TO_DEVICE 2 #define INPUT_SLOT 4 #define INPUT_FLUSH 8 #define INPUT_PASS_TO_ALL (INPUT_PASS_TO_HANDLERS | INPUT_PASS_TO_DEVICE) static int input_handle_abs_event(struct input_dev *dev, unsigned int code, int *pval) { struct input_mt *mt = dev->mt; bool is_new_slot = false; bool is_mt_event; int *pold; if (code == ABS_MT_SLOT) { /* * "Stage" the event; we'll flush it later, when we * get actual touch data. 
*/ if (mt && *pval >= 0 && *pval < mt->num_slots) mt->slot = *pval; return INPUT_IGNORE_EVENT; } is_mt_event = input_is_mt_value(code); if (!is_mt_event) { pold = &dev->absinfo[code].value; } else if (mt) { pold = &mt->slots[mt->slot].abs[code - ABS_MT_FIRST]; is_new_slot = mt->slot != dev->absinfo[ABS_MT_SLOT].value; } else { /* * Bypass filtering for multi-touch events when * not employing slots. */ pold = NULL; } if (pold) { *pval = input_defuzz_abs_event(*pval, *pold, dev->absinfo[code].fuzz); if (*pold == *pval) return INPUT_IGNORE_EVENT; *pold = *pval; } /* Flush pending "slot" event */ if (is_new_slot) { dev->absinfo[ABS_MT_SLOT].value = mt->slot; return INPUT_PASS_TO_HANDLERS | INPUT_SLOT; } return INPUT_PASS_TO_HANDLERS; } static int input_get_disposition(struct input_dev *dev, unsigned int type, unsigned int code, int *pval) { int disposition = INPUT_IGNORE_EVENT; int value = *pval; /* filter-out events from inhibited devices */ if (dev->inhibited) return INPUT_IGNORE_EVENT; switch (type) { case EV_SYN: switch (code) { case SYN_CONFIG: disposition = INPUT_PASS_TO_ALL; break; case SYN_REPORT: disposition = INPUT_PASS_TO_HANDLERS | INPUT_FLUSH; break; case SYN_MT_REPORT: disposition = INPUT_PASS_TO_HANDLERS; break; } break; case EV_KEY: if (is_event_supported(code, dev->keybit, KEY_MAX)) { /* auto-repeat bypasses state updates */ if (value == 2) { disposition = INPUT_PASS_TO_HANDLERS; break; } if (!!test_bit(code, dev->key) != !!value) { __change_bit(code, dev->key); disposition = INPUT_PASS_TO_HANDLERS; } } break; case EV_SW: if (is_event_supported(code, dev->swbit, SW_MAX) && !!test_bit(code, dev->sw) != !!value) { __change_bit(code, dev->sw); disposition = INPUT_PASS_TO_HANDLERS; } break; case EV_ABS: if (is_event_supported(code, dev->absbit, ABS_MAX)) disposition = input_handle_abs_event(dev, code, &value); break; case EV_REL: if (is_event_supported(code, dev->relbit, REL_MAX) && value) disposition = INPUT_PASS_TO_HANDLERS; break; case EV_MSC: if 
(is_event_supported(code, dev->mscbit, MSC_MAX)) disposition = INPUT_PASS_TO_ALL; break; case EV_LED: if (is_event_supported(code, dev->ledbit, LED_MAX) && !!test_bit(code, dev->led) != !!value) { __change_bit(code, dev->led); disposition = INPUT_PASS_TO_ALL; } break; case EV_SND: if (is_event_supported(code, dev->sndbit, SND_MAX)) { if (!!test_bit(code, dev->snd) != !!value) __change_bit(code, dev->snd); disposition = INPUT_PASS_TO_ALL; } break; case EV_REP: if (code <= REP_MAX && value >= 0 && dev->rep[code] != value) { dev->rep[code] = value; disposition = INPUT_PASS_TO_ALL; } break; case EV_FF: if (value >= 0) disposition = INPUT_PASS_TO_ALL; break; case EV_PWR: disposition = INPUT_PASS_TO_ALL; break; } *pval = value; return disposition; } static void input_event_dispose(struct input_dev *dev, int disposition, unsigned int type, unsigned int code, int value) { if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event) dev->event(dev, type, code, value); if (disposition & INPUT_PASS_TO_HANDLERS) { struct input_value *v; if (disposition & INPUT_SLOT) { v = &dev->vals[dev->num_vals++]; v->type = EV_ABS; v->code = ABS_MT_SLOT; v->value = dev->mt->slot; } v = &dev->vals[dev->num_vals++]; v->type = type; v->code = code; v->value = value; } if (disposition & INPUT_FLUSH) { if (dev->num_vals >= 2) input_pass_values(dev, dev->vals, dev->num_vals); dev->num_vals = 0; /* * Reset the timestamp on flush so we won't end up * with a stale one. Note we only need to reset the * monolithic one as we use its presence when deciding * whether to generate a synthetic timestamp. 
*/ dev->timestamp[INPUT_CLK_MONO] = ktime_set(0, 0); } else if (dev->num_vals >= dev->max_vals - 2) { dev->vals[dev->num_vals++] = input_value_sync; input_pass_values(dev, dev->vals, dev->num_vals); dev->num_vals = 0; } } void input_handle_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { int disposition; lockdep_assert_held(&dev->event_lock); disposition = input_get_disposition(dev, type, code, &value); if (disposition != INPUT_IGNORE_EVENT) { if (type != EV_SYN) add_input_randomness(type, code, value); input_event_dispose(dev, disposition, type, code, value); } } /** * input_event() - report new input event * @dev: device that generated the event * @type: type of the event * @code: event code * @value: value of the event * * This function should be used by drivers implementing various input * devices to report input events. See also input_inject_event(). * * NOTE: input_event() may be safely used right after input device was * allocated with input_allocate_device(), even before it is registered * with input_register_device(), but the event will not reach any of the * input handlers. Such early invocation of input_event() may be used * to 'seed' initial state of a switch or initial position of absolute * axis, etc. */ void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { if (is_event_supported(type, dev->evbit, EV_MAX)) { guard(spinlock_irqsave)(&dev->event_lock); input_handle_event(dev, type, code, value); } } EXPORT_SYMBOL(input_event); /** * input_inject_event() - send input event from input handler * @handle: input handle to send event through * @type: type of the event * @code: event code * @value: value of the event * * Similar to input_event() but will ignore event if device is * "grabbed" and handle injecting event is not the one that owns * the device. 
*/ void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct input_dev *dev = handle->dev; struct input_handle *grab; if (is_event_supported(type, dev->evbit, EV_MAX)) { guard(spinlock_irqsave)(&dev->event_lock); guard(rcu)(); grab = rcu_dereference(dev->grab); if (!grab || grab == handle) input_handle_event(dev, type, code, value); } } EXPORT_SYMBOL(input_inject_event); /** * input_alloc_absinfo - allocates array of input_absinfo structs * @dev: the input device emitting absolute events * * If the absinfo struct the caller asked for is already allocated, this * functions will not do anything. */ void input_alloc_absinfo(struct input_dev *dev) { if (dev->absinfo) return; dev->absinfo = kcalloc(ABS_CNT, sizeof(*dev->absinfo), GFP_KERNEL); if (!dev->absinfo) { dev_err(dev->dev.parent ?: &dev->dev, "%s: unable to allocate memory\n", __func__); /* * We will handle this allocation failure in * input_register_device() when we refuse to register input * device with ABS bits but without absinfo. */ } } EXPORT_SYMBOL(input_alloc_absinfo); void input_set_abs_params(struct input_dev *dev, unsigned int axis, int min, int max, int fuzz, int flat) { struct input_absinfo *absinfo; __set_bit(EV_ABS, dev->evbit); __set_bit(axis, dev->absbit); input_alloc_absinfo(dev); if (!dev->absinfo) return; absinfo = &dev->absinfo[axis]; absinfo->minimum = min; absinfo->maximum = max; absinfo->fuzz = fuzz; absinfo->flat = flat; } EXPORT_SYMBOL(input_set_abs_params); /** * input_copy_abs - Copy absinfo from one input_dev to another * @dst: Destination input device to copy the abs settings to * @dst_axis: ABS_* value selecting the destination axis * @src: Source input device to copy the abs settings from * @src_axis: ABS_* value selecting the source axis * * Set absinfo for the selected destination axis by copying it from * the specified source input device's source axis. * This is useful to e.g. 
setup a pen/stylus input-device for combined * touchscreen/pen hardware where the pen uses the same coordinates as * the touchscreen. */ void input_copy_abs(struct input_dev *dst, unsigned int dst_axis, const struct input_dev *src, unsigned int src_axis) { /* src must have EV_ABS and src_axis set */ if (WARN_ON(!(test_bit(EV_ABS, src->evbit) && test_bit(src_axis, src->absbit)))) return; /* * input_alloc_absinfo() may have failed for the source. Our caller is * expected to catch this when registering the input devices, which may * happen after the input_copy_abs() call. */ if (!src->absinfo) return; input_set_capability(dst, EV_ABS, dst_axis); if (!dst->absinfo) return; dst->absinfo[dst_axis] = src->absinfo[src_axis]; } EXPORT_SYMBOL(input_copy_abs); /** * input_grab_device - grabs device for exclusive use * @handle: input handle that wants to own the device * * When a device is grabbed by an input handle all events generated by * the device are delivered only to this handle. Also events injected * by other input handles are ignored while device is grabbed. 
*/ int input_grab_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->grab) return -EBUSY; rcu_assign_pointer(dev->grab, handle); } return 0; } EXPORT_SYMBOL(input_grab_device); static void __input_release_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; struct input_handle *grabber; grabber = rcu_dereference_protected(dev->grab, lockdep_is_held(&dev->mutex)); if (grabber == handle) { rcu_assign_pointer(dev->grab, NULL); /* Make sure input_pass_values() notices that grab is gone */ synchronize_rcu(); list_for_each_entry(handle, &dev->h_list, d_node) if (handle->open && handle->handler->start) handle->handler->start(handle); } } /** * input_release_device - release previously grabbed device * @handle: input handle that owns the device * * Releases previously grabbed device so that other input handles can * start receiving input events. Upon release all handlers attached * to the device have their start() method called so they have a change * to synchronize device state with the rest of the system. */ void input_release_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; guard(mutex)(&dev->mutex); __input_release_device(handle); } EXPORT_SYMBOL(input_release_device); /** * input_open_device - open input device * @handle: handle through which device is being accessed * * This function should be called by input handlers when they * want to start receive events from given input device. */ int input_open_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; int error; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->going_away) return -ENODEV; handle->open++; if (handle->handler->passive_observer) return 0; if (dev->users++ || dev->inhibited) { /* * Device is already opened and/or inhibited, * so we can exit immediately and report success. 
*/ return 0; } if (dev->open) { error = dev->open(dev); if (error) { dev->users--; handle->open--; /* * Make sure we are not delivering any more * events through this handle. */ synchronize_rcu(); return error; } } if (dev->poller) input_dev_poller_start(dev->poller); } return 0; } EXPORT_SYMBOL(input_open_device); int input_flush_device(struct input_handle *handle, struct file *file) { struct input_dev *dev = handle->dev; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->flush) return dev->flush(dev, file); } return 0; } EXPORT_SYMBOL(input_flush_device); /** * input_close_device - close input device * @handle: handle through which device is being accessed * * This function should be called by input handlers when they * want to stop receive events from given input device. */ void input_close_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; guard(mutex)(&dev->mutex); __input_release_device(handle); if (!handle->handler->passive_observer) { if (!--dev->users && !dev->inhibited) { if (dev->poller) input_dev_poller_stop(dev->poller); if (dev->close) dev->close(dev); } } if (!--handle->open) { /* * synchronize_rcu() makes sure that input_pass_values() * completed and that no more input events are delivered * through this handle */ synchronize_rcu(); } } EXPORT_SYMBOL(input_close_device); /* * Simulate keyup events for all keys that are marked as pressed. * The function must be called with dev->event_lock held. */ static bool input_dev_release_keys(struct input_dev *dev) { bool need_sync = false; int code; lockdep_assert_held(&dev->event_lock); if (is_event_supported(EV_KEY, dev->evbit, EV_MAX)) { for_each_set_bit(code, dev->key, KEY_CNT) { input_handle_event(dev, EV_KEY, code, 0); need_sync = true; } } return need_sync; } /* * Prepare device for unregistering */ static void input_disconnect_device(struct input_dev *dev) { struct input_handle *handle; /* * Mark device as going away. 
Note that we take dev->mutex here * not to protect access to dev->going_away but rather to ensure * that there are no threads in the middle of input_open_device() */ scoped_guard(mutex, &dev->mutex) dev->going_away = true; guard(spinlock_irq)(&dev->event_lock); /* * Simulate keyup events for all pressed keys so that handlers * are not left with "stuck" keys. The driver may continue * generate events even after we done here but they will not * reach any handlers. */ if (input_dev_release_keys(dev)) input_handle_event(dev, EV_SYN, SYN_REPORT, 1); list_for_each_entry(handle, &dev->h_list, d_node) handle->open = 0; } /** * input_scancode_to_scalar() - converts scancode in &struct input_keymap_entry * @ke: keymap entry containing scancode to be converted. * @scancode: pointer to the location where converted scancode should * be stored. * * This function is used to convert scancode stored in &struct keymap_entry * into scalar form understood by legacy keymap handling methods. These * methods expect scancodes to be represented as 'unsigned int'. */ int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode) { switch (ke->len) { case 1: *scancode = *((u8 *)ke->scancode); break; case 2: *scancode = *((u16 *)ke->scancode); break; case 4: *scancode = *((u32 *)ke->scancode); break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(input_scancode_to_scalar); /* * Those routines handle the default case where no [gs]etkeycode() is * defined. In this case, an array indexed by the scancode is used. 
*/ static unsigned int input_fetch_keycode(struct input_dev *dev, unsigned int index) { switch (dev->keycodesize) { case 1: return ((u8 *)dev->keycode)[index]; case 2: return ((u16 *)dev->keycode)[index]; default: return ((u32 *)dev->keycode)[index]; } } static int input_default_getkeycode(struct input_dev *dev, struct input_keymap_entry *ke) { unsigned int index; int error; if (!dev->keycodesize) return -EINVAL; if (ke->flags & INPUT_KEYMAP_BY_INDEX) index = ke->index; else { error = input_scancode_to_scalar(ke, &index); if (error) return error; } if (index >= dev->keycodemax) return -EINVAL; ke->keycode = input_fetch_keycode(dev, index); ke->index = index; ke->len = sizeof(index); memcpy(ke->scancode, &index, sizeof(index)); return 0; } static int input_default_setkeycode(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode) { unsigned int index; int error; int i; if (!dev->keycodesize) return -EINVAL; if (ke->flags & INPUT_KEYMAP_BY_INDEX) { index = ke->index; } else { error = input_scancode_to_scalar(ke, &index); if (error) return error; } if (index >= dev->keycodemax) return -EINVAL; if (dev->keycodesize < sizeof(ke->keycode) && (ke->keycode >> (dev->keycodesize * 8))) return -EINVAL; switch (dev->keycodesize) { case 1: { u8 *k = (u8 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } case 2: { u16 *k = (u16 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } default: { u32 *k = (u32 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } } if (*old_keycode <= KEY_MAX) { __clear_bit(*old_keycode, dev->keybit); for (i = 0; i < dev->keycodemax; i++) { if (input_fetch_keycode(dev, i) == *old_keycode) { __set_bit(*old_keycode, dev->keybit); /* Setting the bit twice is useless, so break */ break; } } } __set_bit(ke->keycode, dev->keybit); return 0; } /** * input_get_keycode - retrieve keycode currently mapped to a given scancode * @dev: input device which keymap is being 
queried * @ke: keymap entry * * This function should be called by anyone interested in retrieving current * keymap. Presently evdev handlers use it. */ int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke) { guard(spinlock_irqsave)(&dev->event_lock); return dev->getkeycode(dev, ke); } EXPORT_SYMBOL(input_get_keycode); /** * input_set_keycode - attribute a keycode to a given scancode * @dev: input device which keymap is being updated * @ke: new keymap entry * * This function should be called by anyone needing to update current * keymap. Presently keyboard and evdev handlers use it. */ int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke) { unsigned int old_keycode; int error; if (ke->keycode > KEY_MAX) return -EINVAL; guard(spinlock_irqsave)(&dev->event_lock); error = dev->setkeycode(dev, ke, &old_keycode); if (error) return error; /* Make sure KEY_RESERVED did not get enabled. */ __clear_bit(KEY_RESERVED, dev->keybit); /* * Simulate keyup event if keycode is not present * in the keymap anymore */ if (old_keycode > KEY_MAX) { dev_warn(dev->dev.parent ?: &dev->dev, "%s: got too big old keycode %#x\n", __func__, old_keycode); } else if (test_bit(EV_KEY, dev->evbit) && !is_event_supported(old_keycode, dev->keybit, KEY_MAX) && __test_and_clear_bit(old_keycode, dev->key)) { /* * We have to use input_event_dispose() here directly instead * of input_handle_event() because the key we want to release * here is considered no longer supported by the device and * input_handle_event() will ignore it. 
*/ input_event_dispose(dev, INPUT_PASS_TO_HANDLERS, EV_KEY, old_keycode, 0); input_event_dispose(dev, INPUT_PASS_TO_HANDLERS | INPUT_FLUSH, EV_SYN, SYN_REPORT, 1); } return 0; } EXPORT_SYMBOL(input_set_keycode); bool input_match_device_id(const struct input_dev *dev, const struct input_device_id *id) { if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) if (id->bustype != dev->id.bustype) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_VENDOR) if (id->vendor != dev->id.vendor) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT) if (id->product != dev->id.product) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_VERSION) if (id->version != dev->id.version) return false; if (!bitmap_subset(id->evbit, dev->evbit, EV_MAX) || !bitmap_subset(id->keybit, dev->keybit, KEY_MAX) || !bitmap_subset(id->relbit, dev->relbit, REL_MAX) || !bitmap_subset(id->absbit, dev->absbit, ABS_MAX) || !bitmap_subset(id->mscbit, dev->mscbit, MSC_MAX) || !bitmap_subset(id->ledbit, dev->ledbit, LED_MAX) || !bitmap_subset(id->sndbit, dev->sndbit, SND_MAX) || !bitmap_subset(id->ffbit, dev->ffbit, FF_MAX) || !bitmap_subset(id->swbit, dev->swbit, SW_MAX) || !bitmap_subset(id->propbit, dev->propbit, INPUT_PROP_MAX)) { return false; } return true; } EXPORT_SYMBOL(input_match_device_id); static const struct input_device_id *input_match_device(struct input_handler *handler, struct input_dev *dev) { const struct input_device_id *id; for (id = handler->id_table; id->flags || id->driver_info; id++) { if (input_match_device_id(dev, id) && (!handler->match || handler->match(handler, dev))) { return id; } } return NULL; } static int input_attach_handler(struct input_dev *dev, struct input_handler *handler) { const struct input_device_id *id; int error; id = input_match_device(handler, dev); if (!id) return -ENODEV; error = handler->connect(handler, dev, id); if (error && error != -ENODEV) pr_err("failed to attach handler %s to device %s, error: %d\n", handler->name, kobject_name(&dev->dev.kobj), 
error);
	return error;
}

#ifdef CONFIG_COMPAT

/*
 * Print one bitmap word @bits as hex into @buf (snprintf semantics, at most
 * @buf_size bytes) and return the accumulated snprintf() length.
 *
 * With @skip_empty set, a zero word produces no output and 0 is returned.
 */
static int input_bits_to_string(char *buf, int buf_size,
				unsigned long bits, bool skip_empty)
{
	int len = 0;

	if (in_compat_syscall()) {
		/* Compat callers get the word as two 32-bit halves, high half first. */
		u32 dword = bits >> 32;
		if (dword || !skip_empty)
			len += snprintf(buf, buf_size, "%x ", dword);

		/* Once the high half was printed (len != 0) the low half always follows. */
		dword = bits & 0xffffffffUL;
		if (dword || !skip_empty || len)
			len += snprintf(buf + len, max(buf_size - len, 0),
					"%x", dword);
	} else {
		if (bits || !skip_empty)
			len += snprintf(buf, buf_size, "%lx", bits);
	}

	return len;
}

#else /* !CONFIG_COMPAT */

/*
 * Print @bits as a single "%lx" value into @buf; returns snprintf()'s result,
 * or 0 without touching @buf when the word is zero and @skip_empty is set.
 */
static int input_bits_to_string(char *buf, int buf_size,
				unsigned long bits, bool skip_empty)
{
	return bits || !skip_empty ?
		snprintf(buf, buf_size, "%lx", bits) : 0;
}

#endif

#ifdef CONFIG_PROC_FS

static struct proc_dir_entry *proc_bus_input_dir;
static DECLARE_WAIT_QUEUE_HEAD(input_devices_poll_wait);
/* Change counter; pollers compare it with the snapshot kept in their seq state. */
static int input_devices_state;

/* Bump the change counter and wake everybody waiting on input_devices_poll_wait. */
static inline void input_wakeup_procfs_readers(void)
{
	input_devices_state++;

	wake_up(&input_devices_poll_wait);
}

/* Private per-open state shared by the seq_file iterators below. */
struct input_seq_state {
	unsigned short pos;		/* handler index, set by the handlers iterator */
	bool mutex_acquired;		/* true when ->start managed to take input_mutex */
	int input_devices_state;	/* last input_devices_state value seen by poll */
};

/*
 * Poll method: reports EPOLLIN | EPOLLRDNORM when input_devices_state differs
 * from this file's snapshot and refreshes the snapshot; otherwise returns 0.
 */
static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait)
{
	struct seq_file *seq = file->private_data;
	struct input_seq_state *state = seq->private;

	poll_wait(file, &input_devices_poll_wait, wait);
	if (state->input_devices_state != input_devices_state) {
		state->input_devices_state = input_devices_state;
		return EPOLLIN | EPOLLRDNORM;
	}

	return 0;
}

/*
 * seq_file ->start for the devices listing: takes input_mutex (interruptibly)
 * and returns the entry at *pos in input_dev_list, or ERR_PTR() if the lock
 * could not be taken. The outcome is recorded for input_seq_stop().
 */
static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct input_seq_state *state = seq->private;
	int error;

	error = mutex_lock_interruptible(&input_mutex);
	if (error) {
		state->mutex_acquired = false;
		return ERR_PTR(error);
	}

	state->mutex_acquired = true;

	return seq_list_start(&input_dev_list, *pos);
}

/* seq_file ->next for the devices listing: step to the next input_dev_list entry. */
static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &input_dev_list, pos);
}

/* seq_file ->stop: drop input_mutex, but only if ->start actually acquired it. */
static void input_seq_stop(struct seq_file *seq, void *v)
{
	struct input_seq_state *state = seq->private;

	if (state->mutex_acquired)
		mutex_unlock(&input_mutex);
}

/*
 * Emit "B: <name>=" followed by the words of @bitmap, highest word first and
 * separated by spaces; leading zero words are skipped. @max is the highest
 * bit number and determines how many words are walked.
 */
static void input_seq_print_bitmap(struct seq_file *seq, const char *name,
				   unsigned long *bitmap, int max)
{
	int i;
	bool skip_empty = true;
	char buf[18];	/* room for "%x %x" of two 32-bit halves plus the NUL */

	seq_printf(seq, "B: %s=", name);

	for (i = BITS_TO_LONGS(max) - 1; i >= 0; i--) {
		if (input_bits_to_string(buf, sizeof(buf),
					 bitmap[i], skip_empty)) {
			/* First non-empty word seen: print every following word. */
			skip_empty = false;
			seq_printf(seq, "%s%s", buf, i > 0 ? " " : "");
		}
	}

	/*
	 * If no output was produced print a single 0.
	 */
	if (skip_empty)
		seq_putc(seq, '0');

	seq_putc(seq, '\n');
}

/*
 * seq_file ->show for the devices listing: prints the identity lines
 * (I:, N:, P:, S:, U:, H:) of one input device followed by its capability
 * bitmaps. Bitmaps other than PROP and EV are printed only when the matching
 * event type is set in dev->evbit.
 */
static int input_devices_seq_show(struct seq_file *seq, void *v)
{
	struct input_dev *dev = container_of(v, struct input_dev, node);
	/* May be NULL; printed as an empty string in that case. Freed below. */
	const char *path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
	struct input_handle *handle;

	seq_printf(seq, "I: Bus=%04x Vendor=%04x Product=%04x Version=%04x\n",
		   dev->id.bustype, dev->id.vendor, dev->id.product, dev->id.version);

	seq_printf(seq, "N: Name=\"%s\"\n", dev->name ? dev->name : "");
	seq_printf(seq, "P: Phys=%s\n", dev->phys ? dev->phys : "");
	seq_printf(seq, "S: Sysfs=%s\n", path ? path : "");
	seq_printf(seq, "U: Uniq=%s\n", dev->uniq ? dev->uniq : "");
	seq_puts(seq, "H: Handlers=");

	/* One name per handle attached to this device. */
	list_for_each_entry(handle, &dev->h_list, d_node)
		seq_printf(seq, "%s ", handle->name);
	seq_putc(seq, '\n');

	input_seq_print_bitmap(seq, "PROP", dev->propbit, INPUT_PROP_MAX);

	input_seq_print_bitmap(seq, "EV", dev->evbit, EV_MAX);
	if (test_bit(EV_KEY, dev->evbit))
		input_seq_print_bitmap(seq, "KEY", dev->keybit, KEY_MAX);
	if (test_bit(EV_REL, dev->evbit))
		input_seq_print_bitmap(seq, "REL", dev->relbit, REL_MAX);
	if (test_bit(EV_ABS, dev->evbit))
		input_seq_print_bitmap(seq, "ABS", dev->absbit, ABS_MAX);
	if (test_bit(EV_MSC, dev->evbit))
		input_seq_print_bitmap(seq, "MSC", dev->mscbit, MSC_MAX);
	if (test_bit(EV_LED, dev->evbit))
		input_seq_print_bitmap(seq, "LED", dev->ledbit, LED_MAX);
	if (test_bit(EV_SND, dev->evbit))
		input_seq_print_bitmap(seq, "SND", dev->sndbit, SND_MAX);
	if (test_bit(EV_FF, dev->evbit))
		input_seq_print_bitmap(seq, "FF", dev->ffbit, FF_MAX);
	if (test_bit(EV_SW, dev->evbit))
		input_seq_print_bitmap(seq, "SW", dev->swbit, SW_MAX);

	/* A blank line separates device records. */
	seq_putc(seq, '\n');

	kfree(path);
	return 0;
}

static const struct seq_operations input_devices_seq_ops = {
	.start	= input_devices_seq_start,
	.next	= input_devices_seq_next,
	.stop	= input_seq_stop,
	.show	= input_devices_seq_show,
};

/* Open method: sets up seq_file with a zero-sized-state-free input_seq_state. */
static int input_proc_devices_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &input_devices_seq_ops,
				sizeof(struct input_seq_state));
}

static const struct proc_ops input_devices_proc_ops = {
	.proc_open	= input_proc_devices_open,
	.proc_poll	= input_proc_devices_poll,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release_private,
};

/*
 * seq_file ->start for the handlers listing: same locking protocol as
 * input_devices_seq_start(), and additionally records the starting index
 * in state->pos.
 */
static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct input_seq_state *state = seq->private;
	int error;

	error = mutex_lock_interruptible(&input_mutex);
	if (error) {
		state->mutex_acquired = false;
		return ERR_PTR(error);
	}

	state->mutex_acquired = true;
	state->pos = *pos;

	return seq_list_start(&input_handler_list, *pos);
}

static void
*input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct input_seq_state *state = seq->private;

	/* Track the index of the entry we are moving to; ->show prints it. */
	state->pos = *pos + 1;
	return seq_list_next(v, &input_handler_list, pos);
}

/*
 * seq_file ->show for the handlers listing: one "N: Number=... Name=..." line
 * per handler, with " (filter)" appended when the handler has a filter method
 * and " Minor=..." when it uses legacy minors.
 */
static int input_handlers_seq_show(struct seq_file *seq, void *v)
{
	struct input_handler *handler = container_of(v, struct input_handler, node);
	struct input_seq_state *state = seq->private;

	seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name);
	if (handler->filter)
		seq_puts(seq, " (filter)");
	if (handler->legacy_minors)
		seq_printf(seq, " Minor=%d", handler->minor);
	seq_putc(seq, '\n');

	return 0;
}

static const struct seq_operations input_handlers_seq_ops = {
	.start	= input_handlers_seq_start,
	.next	= input_handlers_seq_next,
	.stop	= input_seq_stop,
	.show	= input_handlers_seq_show,
};

/* Open method for the handlers file; allocates a private input_seq_state. */
static int input_proc_handlers_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &input_handlers_seq_ops,
				sizeof(struct input_seq_state));
}

static const struct proc_ops input_handlers_proc_ops = {
	.proc_open	= input_proc_handlers_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release_private,
};

/*
 * Create the "bus/input" proc directory with its "devices" and "handlers"
 * entries. Returns 0 on success or -ENOMEM, removing whatever had already
 * been created.
 */
static int __init input_proc_init(void)
{
	struct proc_dir_entry *entry;

	proc_bus_input_dir = proc_mkdir("bus/input", NULL);
	if (!proc_bus_input_dir)
		return -ENOMEM;

	entry = proc_create("devices", 0, proc_bus_input_dir,
			    &input_devices_proc_ops);
	if (!entry)
		goto fail1;

	entry = proc_create("handlers", 0, proc_bus_input_dir,
			    &input_handlers_proc_ops);
	if (!entry)
		goto fail2;

	return 0;

	/* Unwind in reverse order of creation. */
 fail2:	remove_proc_entry("devices", proc_bus_input_dir);
 fail1: remove_proc_entry("bus/input", NULL);
	return -ENOMEM;
}

/* Remove both proc entries and then the "bus/input" directory itself. */
static void input_proc_exit(void)
{
	remove_proc_entry("devices", proc_bus_input_dir);
	remove_proc_entry("handlers", proc_bus_input_dir);
	remove_proc_entry("bus/input", NULL);
}

#else /* !CONFIG_PROC_FS */
/* Without procfs the helpers collapse to empty stubs. */
static inline void input_wakeup_procfs_readers(void) { }
static inline int input_proc_init(void) { return 0; }
static inline void
input_proc_exit(void) { }
#endif

/*
 * Generate input_dev_show_<name>(), a sysfs show routine that prints the
 * string member <name> of the input device (or an empty string when the
 * member is NULL), together with its read-only device attribute.
 */
#define INPUT_DEV_STRING_ATTR_SHOW(name)				\
static ssize_t input_dev_show_##name(struct device *dev,		\
				     struct device_attribute *attr,	\
				     char *buf)				\
{									\
	struct input_dev *input_dev = to_input_dev(dev);		\
									\
	return sysfs_emit(buf, "%s\n",					\
			  input_dev->name ? input_dev->name : "");	\
}									\
static DEVICE_ATTR(name, S_IRUGO, input_dev_show_##name, NULL)

INPUT_DEV_STRING_ATTR_SHOW(name);
INPUT_DEV_STRING_ATTR_SHOW(phys);
INPUT_DEV_STRING_ATTR_SHOW(uniq);

/*
 * Print the section letter @name followed by "<BIT>," (upper-case hex) for
 * every bit set in @bm within [@min_bit, @max_bit). Output is bounded by
 * @size, which may be zero or negative (treated as zero); the return value
 * is the sum of the snprintf() results, i.e. the length the full output
 * would need.
 */
static int input_print_modalias_bits(char *buf, int size,
				     char name, const unsigned long *bm,
				     unsigned int min_bit, unsigned int max_bit)
{
	int bit = min_bit;
	int len = 0;

	len += snprintf(buf, max(size, 0), "%c", name);
	for_each_set_bit_from(bit, bm, max_bit)
		len += snprintf(buf + len, max(size - len, 0), "%X,", bit);
	return len;
}

/*
 * Print the modalias string for @id into @buf (@size bytes) and return the
 * length that was needed.
 *
 * @full_len is the length of the complete, untrimmed modalias as computed by
 * a previous sizing pass, or 0 to disable trimming of the key section.
 */
static int input_print_modalias_parts(char *buf, int size, int full_len,
				      const struct input_dev *id)
{
	int len, klen, remainder, space;

	len = snprintf(buf, max(size, 0),
		       "input:b%04Xv%04Xp%04Xe%04X-",
		       id->id.bustype, id->id.vendor,
		       id->id.product, id->id.version);

	len += input_print_modalias_bits(buf + len, size - len,
				'e', id->evbit, 0, EV_MAX);

	/*
	 * Calculate the remaining space in the buffer making sure we
	 * have place for the terminating 0.
	 */
	space = max(size - (len + 1), 0);

	klen = input_print_modalias_bits(buf + len, size - len,
				'k', id->keybit, KEY_MIN_INTERESTING, KEY_MAX);
	len += klen;

	/*
	 * If we have more data than we can fit in the buffer, check
	 * if we can trim key data to fit in the rest. We will indicate
	 * that key data is incomplete by adding "+" sign at the end, like
	 * this: * "k1,2,3,45,+,".
	 *
	 * Note that the shortest key info (if present) is "k+," so we
	 * can only try to trim if key data is longer than that.
	 */
	if (full_len && size < full_len + 1 && klen > 3) {
		/* Length of everything that still has to follow the key section. */
		remainder = full_len - len;
		/*
		 * We can only trim if we have space for the remainder
		 * and also for at least "k+," which is 3 more characters.
		 */
		if (remainder <= space - 3) {
			/*
			 * We are guaranteed to have 'k' in the buffer, so
			 * we need at least 3 additional bytes for storing
			 * "+," in addition to the remainder.
			 */
			for (int i = size - 1 - remainder - 3; i >= 0; i--) {
				if (buf[i] == 'k' || buf[i] == ',') {
					strcpy(buf + i + 1, "+,");
					len = i + 3; /* Not counting '\0' */
					break;
				}
			}
		}
	}

	len += input_print_modalias_bits(buf + len, size - len,
				'r', id->relbit, 0, REL_MAX);
	len += input_print_modalias_bits(buf + len, size - len,
				'a', id->absbit, 0, ABS_MAX);
	len += input_print_modalias_bits(buf + len, size - len,
				'm', id->mscbit, 0, MSC_MAX);
	len += input_print_modalias_bits(buf + len, size - len,
				'l', id->ledbit, 0, LED_MAX);
	len += input_print_modalias_bits(buf + len, size - len,
				's', id->sndbit, 0, SND_MAX);
	len += input_print_modalias_bits(buf + len, size - len,
				'f', id->ffbit, 0, FF_MAX);
	len += input_print_modalias_bits(buf + len, size - len,
				'w', id->swbit, 0, SW_MAX);

	return len;
}

/*
 * Print the modalias of @id into @buf (@size bytes); returns the length
 * reported by the second (real) printing pass.
 */
static int input_print_modalias(char *buf, int size, const struct input_dev *id)
{
	int full_len;

	/*
	 * Printing is done in 2 passes: first one figures out total length
	 * needed for the modalias string, second one will try to trim key
	 * data in case when buffer is too small for the entire modalias.
	 * If the buffer is too small regardless, it will fill as much as it
	 * can (without trimming key data) into the buffer and leave it to
	 * the caller to figure out what to do with the result.
	 */
	full_len = input_print_modalias_parts(NULL, 0, 0, id);
	return input_print_modalias_parts(buf, size, full_len, id);
}

/* sysfs show routine for the "modalias" attribute. */
static ssize_t input_dev_show_modalias(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct input_dev *id = to_input_dev(dev);
	ssize_t len;

	len = input_print_modalias(buf, PAGE_SIZE, id);
	/* Append the newline only while there is still room in the page. */
	if (len < PAGE_SIZE - 2)
		len += snprintf(buf + len, PAGE_SIZE - len, "\n");

	/* The printed length may exceed the page; never report more than that. */
	return min_t(int, len, PAGE_SIZE);
}
static DEVICE_ATTR(modalias, S_IRUGO, input_dev_show_modalias, NULL);

static int input_print_bitmap(char *buf, int buf_size, const unsigned long *bitmap,
			      int max, int add_cr);

/* sysfs show routine for the "properties" attribute (dev->propbit as hex). */
static ssize_t input_dev_show_properties(struct device *dev,
					 struct device_attribute *attr,
					 char *buf)
{
	struct input_dev *input_dev = to_input_dev(dev);
	int len = input_print_bitmap(buf, PAGE_SIZE, input_dev->propbit,
				     INPUT_PROP_MAX, true);
	return min_t(int, len, PAGE_SIZE);
}
static DEVICE_ATTR(properties, S_IRUGO, input_dev_show_properties, NULL);

static int input_inhibit_device(struct input_dev *dev);
static int input_uninhibit_device(struct input_dev *dev);

/* sysfs show routine for "inhibited": prints dev->inhibited as a decimal. */
static ssize_t inhibited_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct input_dev *input_dev = to_input_dev(dev);

	return sysfs_emit(buf, "%d\n", input_dev->inhibited);
}

/*
 * sysfs store routine for "inhibited": parses a boolean and inhibits or
 * uninhibits the device accordingly. Returns @len on success, -EINVAL for
 * unparsable input, or the error from the (un)inhibit call.
 */
static ssize_t inhibited_store(struct device *dev,
			       struct device_attribute *attr, const char *buf,
			       size_t len)
{
	struct input_dev *input_dev = to_input_dev(dev);
	ssize_t rv;
	bool inhibited;

	if (kstrtobool(buf, &inhibited))
		return -EINVAL;

	if (inhibited)
		rv = input_inhibit_device(input_dev);
	else
		rv = input_uninhibit_device(input_dev);

	if (rv != 0)
		return rv;

	return len;
}

static DEVICE_ATTR_RW(inhibited);

static struct attribute *input_dev_attrs[] = {
	&dev_attr_name.attr,
	&dev_attr_phys.attr,
	&dev_attr_uniq.attr,
	&dev_attr_modalias.attr,
	&dev_attr_properties.attr,
	&dev_attr_inhibited.attr,
	NULL
};

static const struct attribute_group input_dev_attr_group = {
	.attrs	= input_dev_attrs,
};

/*
 * Generate input_dev_show_id_<name>(), a sysfs show routine printing the
 * member <name> of dev->id as four hex digits, and its read-only attribute.
 */
#define INPUT_DEV_ID_ATTR(name)						\
static ssize_t input_dev_show_id_##name(struct device *dev,		\
					struct device_attribute *attr,	\
					char *buf)			\
{									\
	struct input_dev *input_dev = to_input_dev(dev);		\
	return sysfs_emit(buf, "%04x\n", input_dev->id.name);		\
}									\
static DEVICE_ATTR(name, S_IRUGO, input_dev_show_id_##name, NULL)

INPUT_DEV_ID_ATTR(bustype);
INPUT_DEV_ID_ATTR(vendor);
INPUT_DEV_ID_ATTR(product);
INPUT_DEV_ID_ATTR(version);

static struct attribute *input_dev_id_attrs[] = {
	&dev_attr_bustype.attr,
	&dev_attr_vendor.attr,
	&dev_attr_product.attr,
	&dev_attr_version.attr,
	NULL
};

static const struct attribute_group input_dev_id_attr_group = {
	.name	= "id",
	.attrs	= input_dev_id_attrs,
};

/*
 * Print @bitmap as space-separated hex words, highest word first, skipping
 * leading zero words; an all-zero bitmap is printed as "0". @max is the
 * highest bit number, and a trailing newline is added when @add_cr is set.
 *
 * Output is bounded by @buf_size; the return value is the sum of the
 * snprintf() results and may therefore exceed @buf_size on truncation.
 */
static int input_print_bitmap(char *buf, int buf_size, const unsigned long *bitmap,
			      int max, int add_cr)
{
	int i;
	int len = 0;
	bool skip_empty = true;

	for (i = BITS_TO_LONGS(max) - 1; i >= 0; i--) {
		len += input_bits_to_string(buf + len, max(buf_size - len, 0),
					    bitmap[i], skip_empty);
		if (len) {
			/* Something has been printed: emit all remaining words. */
			skip_empty = false;
			if (i > 0)
				len += snprintf(buf + len, max(buf_size - len, 0), " ");
		}
	}

	/*
	 * If no output was produced print a single 0.
	 */
	if (len == 0)
		len = snprintf(buf, buf_size, "%d", 0);

	if (add_cr)
		len += snprintf(buf + len, max(buf_size - len, 0), "\n");

	return len;
}

/*
 * Generate input_dev_show_cap_<bm>(), a sysfs show routine printing the
 * capability bitmap dev-><bm>bit (sized by <ev>_MAX), and its read-only
 * attribute named <bm>.
 */
#define INPUT_DEV_CAP_ATTR(ev, bm)					\
static ssize_t input_dev_show_cap_##bm(struct device *dev,		\
				       struct device_attribute *attr,	\
				       char *buf)			\
{									\
	struct input_dev *input_dev = to_input_dev(dev);		\
	int len = input_print_bitmap(buf, PAGE_SIZE,			\
				     input_dev->bm##bit, ev##_MAX,	\
				     true);				\
	return min_t(int, len, PAGE_SIZE);				\
}									\
static DEVICE_ATTR(bm, S_IRUGO, input_dev_show_cap_##bm, NULL)

INPUT_DEV_CAP_ATTR(EV, ev);
INPUT_DEV_CAP_ATTR(KEY, key);
INPUT_DEV_CAP_ATTR(REL, rel);
INPUT_DEV_CAP_ATTR(ABS, abs);
INPUT_DEV_CAP_ATTR(MSC, msc);
INPUT_DEV_CAP_ATTR(LED, led);
INPUT_DEV_CAP_ATTR(SND, snd);
INPUT_DEV_CAP_ATTR(FF, ff);
INPUT_DEV_CAP_ATTR(SW, sw);

static struct attribute *input_dev_caps_attrs[] = {
	&dev_attr_ev.attr,
	&dev_attr_key.attr,
	&dev_attr_rel.attr,
	&dev_attr_abs.attr,
	&dev_attr_msc.attr,
	&dev_attr_led.attr,
	&dev_attr_snd.attr,
	&dev_attr_ff.attr,
	&dev_attr_sw.attr,
	NULL
};

static const struct attribute_group input_dev_caps_attr_group = {
	.name	= "capabilities",
	.attrs	= input_dev_caps_attrs,
};

static const struct attribute_group *input_dev_attr_groups[] = {
	&input_dev_attr_group,
	&input_dev_id_attr_group,
	&input_dev_caps_attr_group,
	&input_poller_attribute_group,
	NULL
};

/*
 * Device-core release callback: tears down force-feedback and multitouch
 * state, frees the memory hanging off the input device and the device
 * itself, then drops the module reference.
 */
static void input_dev_release(struct device *device)
{
	struct input_dev *dev = to_input_dev(device);

	input_ff_destroy(dev);
	input_mt_destroy_slots(dev);
	kfree(dev->poller);
	kfree(dev->absinfo);
	kfree(dev->vals);
	kfree(dev);

	module_put(THIS_MODULE);
}

/*
 * Input uevent interface - loading event handlers based on
 * device bitfields.
*/
/*
 * Add an environment variable whose value is a printed bitmap: @name (which
 * callers pass including the trailing '=') is added first and the bitmap
 * text is then written directly behind it. Returns 0 or -ENOMEM when the
 * environment buffer is too small.
 */
static int input_add_uevent_bm_var(struct kobj_uevent_env *env,
				   const char *name, const unsigned long *bitmap, int max)
{
	int len;

	if (add_uevent_var(env, "%s", name))
		return -ENOMEM;

	/*
	 * Printing starts at buflen - 1, i.e. on top of the last byte of the
	 * variable just added (presumably its terminating NUL - confirm
	 * against add_uevent_var()).
	 */
	len = input_print_bitmap(&env->buf[env->buflen - 1],
				 sizeof(env->buf) - env->buflen,
				 bitmap, max, false);
	if (len >= (sizeof(env->buf) - env->buflen))
		return -ENOMEM;

	env->buflen += len;
	return 0;
}

/*
 * This is a pretty gross hack. When building uevent data the driver core
 * may try adding more environment variables to kobj_uevent_env without
 * telling us, so we have no idea how much of the buffer we can use to
 * avoid overflows/-ENOMEM elsewhere. To work around this let's artificially
 * reduce amount of memory we will use for the modalias environment variable.
 *
 * The potential additions are:
 *
 * SEQNUM=18446744073709551615 - (%llu - 28 bytes)
 * HOME=/ (6 bytes)
 * PATH=/sbin:/bin:/usr/sbin:/usr/bin (34 bytes)
 *
 * 68 bytes total. Allow extra buffer - 96 bytes
 */
#define UEVENT_ENV_EXTRA_LEN	96

/*
 * Add the MODALIAS= variable for @dev, leaving UEVENT_ENV_EXTRA_LEN bytes of
 * the environment buffer unused. Returns 0 or -ENOMEM.
 */
static int input_add_uevent_modalias_var(struct kobj_uevent_env *env,
					 const struct input_dev *dev)
{
	int len;

	if (add_uevent_var(env, "MODALIAS="))
		return -ENOMEM;

	len = input_print_modalias(&env->buf[env->buflen - 1],
				   (int)sizeof(env->buf) - env->buflen -
					UEVENT_ENV_EXTRA_LEN,
				   dev);
	if (len >= ((int)sizeof(env->buf) - env->buflen -
		    UEVENT_ENV_EXTRA_LEN))
		return -ENOMEM;

	env->buflen += len;
	return 0;
}

/*
 * The three helpers below expect a variable named "env" in the calling
 * scope and return from the calling function on error.
 */
#define INPUT_ADD_HOTPLUG_VAR(fmt, val...)				\
	do {								\
		int err = add_uevent_var(env, fmt, val);		\
		if (err)						\
			return err;					\
	} while (0)

#define INPUT_ADD_HOTPLUG_BM_VAR(name, bm, max)				\
	do {								\
		int err = input_add_uevent_bm_var(env, name, bm, max);	\
		if (err)						\
			return err;					\
	} while (0)

#define INPUT_ADD_HOTPLUG_MODALIAS_VAR(dev)				\
	do {								\
		int err = input_add_uevent_modalias_var(env, dev);	\
		if (err)						\
			return err;					\
	} while (0)

/*
 * uevent callback of the input device type: exports PRODUCT, the optional
 * NAME/PHYS/UNIQ strings, the capability bitmaps (those other than PROP and
 * EV only when the event type is supported) and finally MODALIAS.
 * Returns 0, or the first error reported while adding a variable.
 */
static int input_dev_uevent(const struct device *device, struct kobj_uevent_env *env)
{
	const struct input_dev *dev = to_input_dev(device);

	INPUT_ADD_HOTPLUG_VAR("PRODUCT=%x/%x/%x/%x",
				dev->id.bustype, dev->id.vendor,
				dev->id.product, dev->id.version);
	if (dev->name)
		INPUT_ADD_HOTPLUG_VAR("NAME=\"%s\"", dev->name);
	if (dev->phys)
		INPUT_ADD_HOTPLUG_VAR("PHYS=\"%s\"", dev->phys);
	if (dev->uniq)
		INPUT_ADD_HOTPLUG_VAR("UNIQ=\"%s\"", dev->uniq);

	INPUT_ADD_HOTPLUG_BM_VAR("PROP=", dev->propbit, INPUT_PROP_MAX);

	INPUT_ADD_HOTPLUG_BM_VAR("EV=", dev->evbit, EV_MAX);
	if (test_bit(EV_KEY, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("KEY=", dev->keybit, KEY_MAX);
	if (test_bit(EV_REL, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("REL=", dev->relbit, REL_MAX);
	if (test_bit(EV_ABS, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("ABS=", dev->absbit, ABS_MAX);
	if (test_bit(EV_MSC, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("MSC=", dev->mscbit, MSC_MAX);
	if (test_bit(EV_LED, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("LED=", dev->ledbit, LED_MAX);
	if (test_bit(EV_SND, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("SND=", dev->sndbit, SND_MAX);
	if (test_bit(EV_FF, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("FF=", dev->ffbit, FF_MAX);
	if (test_bit(EV_SW, dev->evbit))
		INPUT_ADD_HOTPLUG_BM_VAR("SW=", dev->swbit, SW_MAX);

	INPUT_ADD_HOTPLUG_MODALIAS_VAR(dev);

	return 0;
}

/*
 * Replay the state of every supported EV_<type> code through dev->event().
 * When @on is true each supported code is sent with its current state; when
 * @on is false only currently active codes are sent, with value 0. Does
 * nothing if the device does not support EV_<type>.
 */
#define INPUT_DO_TOGGLE(dev, type, bits, on)				\
	do {								\
		int i;							\
		bool active;						\
									\
		if (!test_bit(EV_##type, dev->evbit))			\
			break;						\
									\
		for_each_set_bit(i, dev->bits##bit, type##_CNT) {	\
			active = test_bit(i, dev->bits);		\
			if (!active && !on)				\
				continue;				\
									\
			dev->event(dev, EV_##type, i, on ? active : 0);	\
		}							\
	} while (0)

/*
 * Push LED and sound state to the driver via dev->event() (restoring it when
 * @activate is true, switching it off otherwise) and, on activation, also
 * re-send the autorepeat period and delay. No-op for devices without an
 * event() method.
 */
static void input_dev_toggle(struct input_dev *dev, bool activate)
{
	if (!dev->event)
		return;

	INPUT_DO_TOGGLE(dev, LED, led, activate);
	INPUT_DO_TOGGLE(dev, SND, snd, activate);

	if (activate && test_bit(EV_REP, dev->evbit)) {
		dev->event(dev, EV_REP, REP_PERIOD, dev->rep[REP_PERIOD]);
		dev->event(dev, EV_REP, REP_DELAY, dev->rep[REP_DELAY]);
	}
}

/**
 * input_reset_device() - reset/restore the state of input device
 * @dev: input device whose state needs to be reset
 *
 * This function tries to reset the state of an opened input device and
 * bring internal state and state of the hardware in sync with each other.
 * We mark all keys as released, restore LED state, repeat rate, etc.
 */
void input_reset_device(struct input_dev *dev)
{
	/* Both locks are released automatically when the function returns. */
	guard(mutex)(&dev->mutex);
	guard(spinlock_irqsave)(&dev->event_lock);

	input_dev_toggle(dev, true);
	/* Only emit SYN_REPORT when some key was actually released. */
	if (input_dev_release_keys(dev))
		input_handle_event(dev, EV_SYN, SYN_REPORT, 1);
}
EXPORT_SYMBOL(input_reset_device);

/*
 * Inhibit @dev: if it has users, close it and stop its poller; then release
 * multitouch slots and keys, emit SYN_REPORT, switch off LEDs/sounds and mark
 * the device inhibited. Returns 0 (also when already inhibited).
 */
static int input_inhibit_device(struct input_dev *dev)
{
	guard(mutex)(&dev->mutex);

	if (dev->inhibited)
		return 0;

	if (dev->users) {
		if (dev->close)
			dev->close(dev);
		if (dev->poller)
			input_dev_poller_stop(dev->poller);
	}

	scoped_guard(spinlock_irq, &dev->event_lock) {
		input_mt_release_slots(dev);
		input_dev_release_keys(dev);
		input_handle_event(dev, EV_SYN, SYN_REPORT, 1);
		input_dev_toggle(dev, false);
	}

	dev->inhibited = true;

	return 0;
}

/*
 * Undo input_inhibit_device(): if the device has users, reopen it and start
 * its poller, then clear the inhibited flag and restore LED/sound/repeat
 * state. Returns 0, or the error from dev->open(), in which case the device
 * stays inhibited.
 */
static int input_uninhibit_device(struct input_dev *dev)
{
	int error;

	guard(mutex)(&dev->mutex);

	if (!dev->inhibited)
		return 0;

	if (dev->users) {
		if (dev->open) {
			error = dev->open(dev);
			if (error)
				return error;
		}
		if (dev->poller)
			input_dev_poller_start(dev->poller);
	}

	dev->inhibited = false;

	scoped_guard(spinlock_irq, &dev->event_lock)
		input_dev_toggle(dev, true);

	return 0;
}

/* PM ->suspend: release pressed keys and switch off LEDs and sounds. */
static int input_dev_suspend(struct device *dev)
{
	struct input_dev *input_dev = to_input_dev(dev);

	guard(spinlock_irq)(&input_dev->event_lock);

	/*
	 * Keys that are pressed now are unlikely to be
	 * still pressed when we resume.
	 */
	if (input_dev_release_keys(input_dev))
		input_handle_event(input_dev, EV_SYN, SYN_REPORT, 1);

	/* Turn off LEDs and sounds, if any are active. */
	input_dev_toggle(input_dev, false);

	return 0;
}

/* PM ->resume (also used for ->restore): bring LEDs and sounds back. */
static int input_dev_resume(struct device *dev)
{
	struct input_dev *input_dev = to_input_dev(dev);

	guard(spinlock_irq)(&input_dev->event_lock);

	/* Restore state of LEDs and sounds, if any were active. */
	input_dev_toggle(input_dev, true);

	return 0;
}

/* PM ->freeze: release pressed keys; LEDs and sounds are left untouched. */
static int input_dev_freeze(struct device *dev)
{
	struct input_dev *input_dev = to_input_dev(dev);

	guard(spinlock_irq)(&input_dev->event_lock);

	/*
	 * Keys that are pressed now are unlikely to be
	 * still pressed when we resume.
	 */
	if (input_dev_release_keys(input_dev))
		input_handle_event(input_dev, EV_SYN, SYN_REPORT, 1);

	return 0;
}

/* PM ->poweroff: only switch off LEDs and sounds. */
static int input_dev_poweroff(struct device *dev)
{
	struct input_dev *input_dev = to_input_dev(dev);

	guard(spinlock_irq)(&input_dev->event_lock);

	/* Turn off LEDs and sounds, if any are active. */
	input_dev_toggle(input_dev, false);

	return 0;
}

static const struct dev_pm_ops input_dev_pm_ops = {
	.suspend	= input_dev_suspend,
	.resume		= input_dev_resume,
	.freeze		= input_dev_freeze,
	.poweroff	= input_dev_poweroff,
	.restore	= input_dev_resume,
};

static const struct device_type input_dev_type = {
	.groups		= input_dev_attr_groups,
	.release	= input_dev_release,
	.uevent		= input_dev_uevent,
	.pm		= pm_sleep_ptr(&input_dev_pm_ops),
};

/* Class devnode callback: device nodes are named "input/<device name>". */
static char *input_devnode(const struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "input/%s", dev_name(dev));
}

const struct class input_class = {
	.name		= "input",
	.devnode	= input_devnode,
};
EXPORT_SYMBOL_GPL(input_class);

/**
 * input_allocate_device - allocate memory for new input device
 *
 * Returns prepared struct input_dev or %NULL.
 *
 * NOTE: Use input_free_device() to free devices that have not been
 * registered; input_unregister_device() should be used for already
 * registered devices.
*/
struct input_dev *input_allocate_device(void)
{
	/* Source of the "inputN" device names; starts at -1 so the first is input0. */
	static atomic_t input_no = ATOMIC_INIT(-1);
	struct input_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	/*
	 * Start with space for SYN_REPORT + 7 EV_KEY/EV_MSC events + 2 spare,
	 * see input_estimate_events_per_packet(). We will tune the number
	 * when we register the device.
	 */
	dev->max_vals = 10;
	dev->vals = kcalloc(dev->max_vals, sizeof(*dev->vals), GFP_KERNEL);
	if (!dev->vals) {
		/* device_initialize() has not run yet, so plain kfree() is fine. */
		kfree(dev);
		return NULL;
	}

	mutex_init(&dev->mutex);
	spin_lock_init(&dev->event_lock);
	/* The timer callback is installed later, see input_enable_softrepeat(). */
	timer_setup(&dev->timer, NULL, 0);
	INIT_LIST_HEAD(&dev->h_list);
	INIT_LIST_HEAD(&dev->node);

	dev->dev.type = &input_dev_type;
	dev->dev.class = &input_class;
	device_initialize(&dev->dev);
	/*
	 * From this point on we can no longer simply "kfree(dev)", we need
	 * to use input_free_device() so that device core properly frees its
	 * resources associated with the input device.
	 */

	dev_set_name(&dev->dev, "input%lu",
		     (unsigned long)atomic_inc_return(&input_no));

	/* Balanced by module_put() in input_dev_release(). */
	__module_get(THIS_MODULE);

	return dev;
}
EXPORT_SYMBOL(input_allocate_device);

/* devres payload: just remembers which input device the record belongs to. */
struct input_devres {
	struct input_dev *input;
};

/* devres match callback: true when the record @res refers to input device @data. */
static int devm_input_device_match(struct device *dev, void *res, void *data)
{
	struct input_devres *devres = res;

	return devres->input == data;
}

/* devres release callback: drops the reference held on the managed input device. */
static void devm_input_device_release(struct device *dev, void *res)
{
	struct input_devres *devres = res;
	struct input_dev *input = devres->input;

	dev_dbg(dev, "%s: dropping reference to %s\n",
		__func__, dev_name(&input->dev));
	input_put_device(input);
}

/**
 * devm_input_allocate_device - allocate managed input device
 * @dev: device owning the input device being created
 *
 * Returns prepared struct input_dev or %NULL.
 *
 * Managed input devices do not need to be explicitly unregistered or
 * freed as it will be done automatically when owner device unbinds from
 * its driver (or binding fails).
Once managed input device is allocated, * it is ready to be set up and registered in the same fashion as regular * input device. There are no special devm_input_device_[un]register() * variants, regular ones work with both managed and unmanaged devices, * should you need them. In most cases however, managed input device need * not be explicitly unregistered or freed. * * NOTE: the owner device is set up as parent of input device and users * should not override it. */ struct input_dev *devm_input_allocate_device(struct device *dev) { struct input_dev *input; struct input_devres *devres; devres = devres_alloc(devm_input_device_release, sizeof(*devres), GFP_KERNEL); if (!devres) return NULL; input = input_allocate_device(); if (!input) { devres_free(devres); return NULL; } input->dev.parent = dev; input->devres_managed = true; devres->input = input; devres_add(dev, devres); return input; } EXPORT_SYMBOL(devm_input_allocate_device); /** * input_free_device - free memory occupied by input_dev structure * @dev: input device to free * * This function should only be used if input_register_device() * was not called yet or if it failed. Once device was registered * use input_unregister_device() and memory will be freed once last * reference to the device is dropped. * * Device should be allocated by input_allocate_device(). * * NOTE: If there are references to the input device then memory * will not be freed until last reference is dropped. */ void input_free_device(struct input_dev *dev) { if (dev) { if (dev->devres_managed) WARN_ON(devres_destroy(dev->dev.parent, devm_input_device_release, devm_input_device_match, dev)); input_put_device(dev); } } EXPORT_SYMBOL(input_free_device); /** * input_set_timestamp - set timestamp for input events * @dev: input device to set timestamp for * @timestamp: the time at which the event has occurred * in CLOCK_MONOTONIC * * This function is intended to provide to the input system a more * accurate time of when an event actually occurred. 
The driver should * call this function as soon as a timestamp is acquired ensuring * clock conversions in input_set_timestamp are done correctly. * * The system entering suspend state between timestamp acquisition and * calling input_set_timestamp can result in inaccurate conversions. */ void input_set_timestamp(struct input_dev *dev, ktime_t timestamp) { dev->timestamp[INPUT_CLK_MONO] = timestamp; dev->timestamp[INPUT_CLK_REAL] = ktime_mono_to_real(timestamp); dev->timestamp[INPUT_CLK_BOOT] = ktime_mono_to_any(timestamp, TK_OFFS_BOOT); } EXPORT_SYMBOL(input_set_timestamp); /** * input_get_timestamp - get timestamp for input events * @dev: input device to get timestamp from * * A valid timestamp is a timestamp of non-zero value. */ ktime_t *input_get_timestamp(struct input_dev *dev) { const ktime_t invalid_timestamp = ktime_set(0, 0); if (!ktime_compare(dev->timestamp[INPUT_CLK_MONO], invalid_timestamp)) input_set_timestamp(dev, ktime_get()); return dev->timestamp; } EXPORT_SYMBOL(input_get_timestamp); /** * input_set_capability - mark device as capable of a certain event * @dev: device that is capable of emitting or accepting event * @type: type of the event (EV_KEY, EV_REL, etc...) * @code: event code * * In addition to setting up corresponding bit in appropriate capability * bitmap the function also adjusts dev->evbit. 
*/
void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code)
{
	/* Reject codes above the per-type maximum (types with no maximum are not checked). */
	if (type < EV_CNT && input_max_code[type] &&
	    code > input_max_code[type]) {
		pr_err("%s: invalid code %u for type %u\n", __func__, code,
		       type);
		dump_stack();
		return;
	}

	switch (type) {
	case EV_KEY:
		__set_bit(code, dev->keybit);
		break;

	case EV_REL:
		__set_bit(code, dev->relbit);
		break;

	case EV_ABS:
		/* Absolute axes also need the absinfo array to exist. */
		input_alloc_absinfo(dev);
		__set_bit(code, dev->absbit);
		break;

	case EV_MSC:
		__set_bit(code, dev->mscbit);
		break;

	case EV_SW:
		__set_bit(code, dev->swbit);
		break;

	case EV_LED:
		__set_bit(code, dev->ledbit);
		break;

	case EV_SND:
		__set_bit(code, dev->sndbit);
		break;

	case EV_FF:
		__set_bit(code, dev->ffbit);
		break;

	case EV_PWR:
		/* do nothing */
		break;

	default:
		/* Unknown types leave dev->evbit untouched. */
		pr_err("%s: unknown type %u (code %u)\n", __func__, type, code);
		dump_stack();
		return;
	}

	__set_bit(type, dev->evbit);
}
EXPORT_SYMBOL(input_set_capability);

/*
 * Estimate how many events the device may send in one packet, based on its
 * multitouch slot count and on the number of absolute and relative axes it
 * advertises, plus a fixed allowance for KEY and MSC events.
 */
static unsigned int input_estimate_events_per_packet(struct input_dev *dev)
{
	int mt_slots;
	int i;
	unsigned int events;

	if (dev->mt) {
		mt_slots = dev->mt->num_slots;
	} else if (test_bit(ABS_MT_TRACKING_ID, dev->absbit)) {
		/* No slot data: derive a slot count from the tracking ID range. */
		mt_slots = dev->absinfo[ABS_MT_TRACKING_ID].maximum -
			   dev->absinfo[ABS_MT_TRACKING_ID].minimum + 1;
		mt_slots = clamp(mt_slots, 2, 32);
	} else if (test_bit(ABS_MT_POSITION_X, dev->absbit)) {
		mt_slots = 2;
	} else {
		mt_slots = 0;
	}

	events = mt_slots + 1; /* count SYN_MT_REPORT and SYN_REPORT */

	/* Multitouch axes may report once per slot, other axes once. */
	if (test_bit(EV_ABS, dev->evbit))
		for_each_set_bit(i, dev->absbit, ABS_CNT)
			events += input_is_mt_axis(i) ? mt_slots : 1;

	if (test_bit(EV_REL, dev->evbit))
		events += bitmap_weight(dev->relbit, REL_CNT);

	/* Make room for KEY and MSC events */
	events += 7;

	return events;
}

/* Zero dev-><bits>bit when EV_<type> is not set in dev->evbit. */
#define INPUT_CLEANSE_BITMASK(dev, type, bits)				\
	do {								\
		if (!test_bit(EV_##type, dev->evbit))			\
			memset(dev->bits##bit, 0,			\
				sizeof(dev->bits##bit));		\
	} while (0)

/* Clear every capability bitmap whose event type the device does not declare. */
static void input_cleanse_bitmasks(struct input_dev *dev)
{
	INPUT_CLEANSE_BITMASK(dev, KEY, key);
	INPUT_CLEANSE_BITMASK(dev, REL, rel);
	INPUT_CLEANSE_BITMASK(dev, ABS, abs);
	INPUT_CLEANSE_BITMASK(dev, MSC, msc);
	INPUT_CLEANSE_BITMASK(dev, LED, led);
	INPUT_CLEANSE_BITMASK(dev, SND, snd);
	INPUT_CLEANSE_BITMASK(dev, FF, ff);
	INPUT_CLEANSE_BITMASK(dev, SW, sw);
}

/*
 * Common unregistration path: disconnect the device, detach every handle
 * under input_mutex, stop the autorepeat timer, take the device off the
 * global list and finally remove it from the device core.
 */
static void __input_unregister_device(struct input_dev *dev)
{
	struct input_handle *handle, *next;

	input_disconnect_device(dev);

	scoped_guard(mutex, &input_mutex) {
		/* _safe variant: the list is expected to shrink as handles disconnect. */
		list_for_each_entry_safe(handle, next, &dev->h_list, d_node)
			handle->handler->disconnect(handle);
		WARN_ON(!list_empty(&dev->h_list));

		del_timer_sync(&dev->timer);
		list_del_init(&dev->node);

		input_wakeup_procfs_readers();
	}

	device_del(&dev->dev);
}

/* devres release callback that unregisters a managed input device. */
static void devm_input_device_unregister(struct device *dev, void *res)
{
	struct input_devres *devres = res;
	struct input_dev *input = devres->input;

	dev_dbg(dev, "%s: unregistering device %s\n",
		__func__, dev_name(&input->dev));
	__input_unregister_device(input);
}

/*
 * Generate software autorepeat event. Note that we take
 * dev->event_lock here to avoid racing with input_event
 * which may cause keys get "stuck".
*/ static void input_repeat_key(struct timer_list *t) { struct input_dev *dev = from_timer(dev, t, timer); guard(spinlock_irqsave)(&dev->event_lock); if (!dev->inhibited && test_bit(dev->repeat_key, dev->key) && is_event_supported(dev->repeat_key, dev->keybit, KEY_MAX)) { input_set_timestamp(dev, ktime_get()); input_handle_event(dev, EV_KEY, dev->repeat_key, 2); input_handle_event(dev, EV_SYN, SYN_REPORT, 1); if (dev->rep[REP_PERIOD]) mod_timer(&dev->timer, jiffies + msecs_to_jiffies(dev->rep[REP_PERIOD])); } } /** * input_enable_softrepeat - enable software autorepeat * @dev: input device * @delay: repeat delay * @period: repeat period * * Enable software autorepeat on the input device. */ void input_enable_softrepeat(struct input_dev *dev, int delay, int period) { dev->timer.function = input_repeat_key; dev->rep[REP_DELAY] = delay; dev->rep[REP_PERIOD] = period; } EXPORT_SYMBOL(input_enable_softrepeat); bool input_device_enabled(struct input_dev *dev) { lockdep_assert_held(&dev->mutex); return !dev->inhibited && dev->users > 0; } EXPORT_SYMBOL_GPL(input_device_enabled); static int input_device_tune_vals(struct input_dev *dev) { struct input_value *vals; unsigned int packet_size; unsigned int max_vals; packet_size = input_estimate_events_per_packet(dev); if (dev->hint_events_per_packet < packet_size) dev->hint_events_per_packet = packet_size; max_vals = dev->hint_events_per_packet + 2; if (dev->max_vals >= max_vals) return 0; vals = kcalloc(max_vals, sizeof(*vals), GFP_KERNEL); if (!vals) return -ENOMEM; scoped_guard(spinlock_irq, &dev->event_lock) { dev->max_vals = max_vals; swap(dev->vals, vals); } /* Because of swap() above, this frees the old vals memory */ kfree(vals); return 0; } /** * input_register_device - register device with input core * @dev: device to be registered * * This function registers device with input core. The device must be * allocated with input_allocate_device() and all it's capabilities * set up before registering. 
* If function fails the device must be freed with input_free_device().
 * Once device has been successfully registered it can be unregistered
 * with input_unregister_device(); input_free_device() should not be
 * called in this case.
 *
 * Note that this function is also used to register managed input devices
 * (ones allocated with devm_input_allocate_device()). Such managed input
 * devices need not be explicitly unregistered or freed, their tear down
 * is controlled by the devres infrastructure. It is also worth noting
 * that tear down of managed input devices is internally a 2-step process:
 * registered managed input device is first unregistered, but stays in
 * memory and can still handle input_event() calls (although events will
 * not be delivered anywhere). The freeing of managed input device will
 * happen later, when devres stack is unwound to the point where device
 * allocation was made.
 */
int input_register_device(struct input_dev *dev)
{
	struct input_devres *devres = NULL;
	struct input_handler *handler;
	const char *path;
	int error;

	if (test_bit(EV_ABS, dev->evbit) && !dev->absinfo) {
		dev_err(&dev->dev,
			"Absolute device without dev->absinfo, refusing to register\n");
		return -EINVAL;
	}

	if (dev->devres_managed) {
		/* Second devres record; it unregisters the device on unwind. */
		devres = devres_alloc(devm_input_device_unregister,
				      sizeof(*devres), GFP_KERNEL);
		if (!devres)
			return -ENOMEM;

		devres->input = dev;
	}

	/* Every input device generates EV_SYN/SYN_REPORT events. */
	__set_bit(EV_SYN, dev->evbit);

	/* KEY_RESERVED is not supposed to be transmitted to userspace. */
	__clear_bit(KEY_RESERVED, dev->keybit);

	/* Make sure that bitmasks not mentioned in dev->evbit are clean. */
	input_cleanse_bitmasks(dev);

	error = input_device_tune_vals(dev);
	if (error)
		goto err_devres_free;

	/*
	 * If delay and period are pre-set by the driver, then autorepeating
	 * is handled by the driver itself and we don't do it in input.c.
	 */
	if (!dev->rep[REP_DELAY] && !dev->rep[REP_PERIOD])
		input_enable_softrepeat(dev, 250, 33);

	/* Fall back to the default keymap accessors when the driver has none. */
	if (!dev->getkeycode)
		dev->getkeycode = input_default_getkeycode;

	if (!dev->setkeycode)
		dev->setkeycode = input_default_setkeycode;

	if (dev->poller)
		input_dev_poller_finalize(dev->poller);

	error = device_add(&dev->dev);
	if (error)
		goto err_devres_free;

	/* path may be NULL on allocation failure; "N/A" is logged then. */
	path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
	pr_info("%s as %s\n",
		dev->name ? dev->name : "Unspecified device",
		path ? path : "N/A");
	kfree(path);

	/* Value returned if taking input_mutex below is interrupted. */
	error = -EINTR;
	scoped_cond_guard(mutex_intr, goto err_device_del, &input_mutex) {
		list_add_tail(&dev->node, &input_dev_list);

		/* Offer the new device to every registered handler. */
		list_for_each_entry(handler, &input_handler_list, node)
			input_attach_handler(dev, handler);

		input_wakeup_procfs_readers();
	}

	if (dev->devres_managed) {
		dev_dbg(dev->dev.parent, "%s: registering %s with devres.\n",
			__func__, dev_name(&dev->dev));
		devres_add(dev->dev.parent, devres);
	}

	return 0;

err_device_del:
	device_del(&dev->dev);
err_devres_free:
	/* devres is NULL for unmanaged devices. */
	devres_free(devres);
	return error;
}
EXPORT_SYMBOL(input_register_device);

/**
 * input_unregister_device - unregister previously registered device
 * @dev: device to be unregistered
 *
 * This function unregisters an input device. Once device is unregistered
 * the caller should not try to access it as it may get freed at any moment.
 */
void input_unregister_device(struct input_dev *dev)
{
	if (dev->devres_managed) {
		/* Drop the "unregister" devres record, then unregister by hand. */
		WARN_ON(devres_destroy(dev->dev.parent,
					devm_input_device_unregister,
					devm_input_device_match,
					dev));
		__input_unregister_device(dev);
		/*
		 * We do not do input_put_device() here because it will be done
		 * when 2nd devres fires up.
		 */
	} else {
		__input_unregister_device(dev);
		input_put_device(dev);
	}
}
EXPORT_SYMBOL(input_unregister_device);

/*
 * A handler may define at most one of filter(), events() and event().
 * Returns 0 when that holds and -EINVAL (after logging) otherwise.
 */
static int input_handler_check_methods(const struct input_handler *handler)
{
	int count = 0;

	if (handler->filter)
		count++;
	if (handler->events)
		count++;
	if (handler->event)
		count++;

	if (count > 1) {
		pr_err("%s: only one event processing method can be defined (%s)\n",
		       __func__, handler->name);
		return -EINVAL;
	}

	return 0;
}

/**
 * input_register_handler - register a new input handler
 * @handler: handler to be registered
 *
 * This function registers a new input handler (interface) for input
 * devices in the system and attaches it to all input devices that
 * are compatible with the handler.
 */
int input_register_handler(struct input_handler *handler)
{
	struct input_dev *dev;
	int error;

	error = input_handler_check_methods(handler);
	if (error)
		return error;

	/* Returns -EINTR if waiting for input_mutex is interrupted. */
	scoped_cond_guard(mutex_intr, return -EINTR, &input_mutex) {
		INIT_LIST_HEAD(&handler->h_list);

		list_add_tail(&handler->node, &input_handler_list);

		/* Offer every already registered device to the new handler. */
		list_for_each_entry(dev, &input_dev_list, node)
			input_attach_handler(dev, handler);

		input_wakeup_procfs_readers();
	}

	return 0;
}
EXPORT_SYMBOL(input_register_handler);

/**
 * input_unregister_handler - unregisters an input handler
 * @handler: handler to be unregistered
 *
 * This function disconnects a handler from its input devices and
 * removes it from lists of known handlers.
*/ void input_unregister_handler(struct input_handler *handler) { struct input_handle *handle, *next; guard(mutex)(&input_mutex); list_for_each_entry_safe(handle, next, &handler->h_list, h_node) handler->disconnect(handle); WARN_ON(!list_empty(&handler->h_list)); list_del_init(&handler->node); input_wakeup_procfs_readers(); } EXPORT_SYMBOL(input_unregister_handler); /** * input_handler_for_each_handle - handle iterator * @handler: input handler to iterate * @data: data for the callback * @fn: function to be called for each handle * * Iterate over @handler's list of handles, and call @fn for each, passing * it @data and stop when @fn returns a non-zero value. The function is * using RCU to traverse the list and therefore may be used in atomic * contexts. The @fn callback is invoked from RCU critical section and * thus must not sleep. * * Return: the first non-zero value returned by @fn, or 0 if @fn returned 0 * for every handle (or the list was empty). */ int input_handler_for_each_handle(struct input_handler *handler, void *data, int (*fn)(struct input_handle *, void *)) { struct input_handle *handle; int retval; guard(rcu)(); list_for_each_entry_rcu(handle, &handler->h_list, h_node) { retval = fn(handle, data); if (retval) return retval; } return 0; } EXPORT_SYMBOL(input_handler_for_each_handle); /* * An implementation of input_handle's handle_events() method that simply * invokes handler->event() method for each event one by one. */ static unsigned int input_handle_events_default(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct input_handler *handler = handle->handler; struct input_value *v; for (v = vals; v != vals + count; v++) handler->event(handle, v->type, v->code, v->value); return count; } /* * An implementation of input_handle's handle_events() method that invokes * handler->filter() method for each event one by one and removes events * that were filtered out from the "vals" array.
*/ static unsigned int input_handle_events_filter(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct input_handler *handler = handle->handler; struct input_value *end = vals; struct input_value *v; for (v = vals; v != vals + count; v++) { if (handler->filter(handle, v->type, v->code, v->value)) continue; if (end != v) *end = *v; end++; } return end - vals; } /* * An implementation of input_handle's handle_events() method that does nothing. */ static unsigned int input_handle_events_null(struct input_handle *handle, struct input_value *vals, unsigned int count) { return count; } /* * Sets up appropriate handle->event_handler based on the input_handler * associated with the handle. */ static void input_handle_setup_event_handler(struct input_handle *handle) { struct input_handler *handler = handle->handler; if (handler->filter) handle->handle_events = input_handle_events_filter; else if (handler->event) handle->handle_events = input_handle_events_default; else if (handler->events) handle->handle_events = handler->events; else handle->handle_events = input_handle_events_null; } /** * input_register_handle - register a new input handle * @handle: handle to register * * This function puts a new input handle onto device's * and handler's lists so that events can flow through * it once it is opened using input_open_device(). * * This function is supposed to be called from handler's * connect() method. */ int input_register_handle(struct input_handle *handle) { struct input_handler *handler = handle->handler; struct input_dev *dev = handle->dev; input_handle_setup_event_handler(handle); /* * We take dev->mutex here to prevent race with * input_release_device(). */ scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { /* * Filters go to the head of the list, normal handlers * to the tail. 
*/ if (handler->filter) list_add_rcu(&handle->d_node, &dev->h_list); else list_add_tail_rcu(&handle->d_node, &dev->h_list); } /* * Since we are supposed to be called from ->connect() * which is mutually exclusive with ->disconnect() * we can't be racing with input_unregister_handle() * and so separate lock is not needed here. */ list_add_tail_rcu(&handle->h_node, &handler->h_list); if (handler->start) handler->start(handle); return 0; } EXPORT_SYMBOL(input_register_handle); /** * input_unregister_handle - unregister an input handle * @handle: handle to unregister * * This function removes input handle from device's * and handler's lists. * * This function is supposed to be called from handler's * disconnect() method. */ void input_unregister_handle(struct input_handle *handle) { struct input_dev *dev = handle->dev; list_del_rcu(&handle->h_node); /* * Take dev->mutex to prevent race with input_release_device(). */ scoped_guard(mutex, &dev->mutex) list_del_rcu(&handle->d_node); synchronize_rcu(); } EXPORT_SYMBOL(input_unregister_handle); /** * input_get_new_minor - allocates a new input minor number * @legacy_base: beginning of the legacy range to be searched * @legacy_num: size of legacy range * @allow_dynamic: whether we can also take ID from the dynamic range * * This function allocates a new device minor from the input major namespace. * Caller can request legacy minor by specifying @legacy_base and @legacy_num * parameters and whether ID can be allocated from dynamic range if there are * no free IDs in legacy range. */ int input_get_new_minor(int legacy_base, unsigned int legacy_num, bool allow_dynamic) { /* * This function should be called from input handler's ->connect() * methods, which are serialized with input_mutex, so no additional * locking is needed here.
*/ if (legacy_base >= 0) { int minor = ida_alloc_range(&input_ida, legacy_base, legacy_base + legacy_num - 1, GFP_KERNEL); if (minor >= 0 || !allow_dynamic) return minor; } return ida_alloc_range(&input_ida, INPUT_FIRST_DYNAMIC_DEV, INPUT_MAX_CHAR_DEVICES - 1, GFP_KERNEL); } EXPORT_SYMBOL(input_get_new_minor); /** * input_free_minor - release previously allocated minor * @minor: minor to be released * * This function releases previously allocated input minor so that it can be * reused later. */ void input_free_minor(unsigned int minor) { ida_free(&input_ida, minor); } EXPORT_SYMBOL(input_free_minor); static int __init input_init(void) { int err; err = class_register(&input_class); if (err) { pr_err("unable to register input_dev class\n"); return err; } err = input_proc_init(); if (err) goto fail1; err = register_chrdev_region(MKDEV(INPUT_MAJOR, 0), INPUT_MAX_CHAR_DEVICES, "input"); if (err) { pr_err("unable to register char major %d", INPUT_MAJOR); goto fail2; } return 0; fail2: input_proc_exit(); fail1: class_unregister(&input_class); return err; } static void __exit input_exit(void) { input_proc_exit(); unregister_chrdev_region(MKDEV(INPUT_MAJOR, 0), INPUT_MAX_CHAR_DEVICES); class_unregister(&input_class); } subsys_initcall(input_init); module_exit(input_exit);
2 4 2 2 1 2 1 1 1 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 
522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 // SPDX-License-Identifier: GPL-2.0-or-later /* * Apple "Magic" Wireless 
Mouse driver * * Copyright (c) 2010 Michael Poole <mdpoole@troilus.org> * Copyright (c) 2010 Chase Douglas <chase.douglas@canonical.com> */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/hid.h> #include <linux/input/mt.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/workqueue.h> #include "hid-ids.h" static bool emulate_3button = true; module_param(emulate_3button, bool, 0644); MODULE_PARM_DESC(emulate_3button, "Emulate a middle button"); static int middle_button_start = -350; static int middle_button_stop = +350; static bool emulate_scroll_wheel = true; module_param(emulate_scroll_wheel, bool, 0644); MODULE_PARM_DESC(emulate_scroll_wheel, "Emulate a scroll wheel"); static unsigned int scroll_speed = 32; static int param_set_scroll_speed(const char *val, const struct kernel_param *kp) { unsigned long speed; if (!val || kstrtoul(val, 0, &speed) || speed > 63) return -EINVAL; scroll_speed = speed; return 0; } module_param_call(scroll_speed, param_set_scroll_speed, param_get_uint, &scroll_speed, 0644); MODULE_PARM_DESC(scroll_speed, "Scroll speed, value from 0 (slow) to 63 (fast)"); static bool scroll_acceleration = false; module_param(scroll_acceleration, bool, 0644); MODULE_PARM_DESC(scroll_acceleration, "Accelerate sequential scroll events"); static bool report_undeciphered; module_param(report_undeciphered, bool, 0644); MODULE_PARM_DESC(report_undeciphered, "Report undeciphered multi-touch state field using a MSC_RAW event"); #define TRACKPAD2_2021_BT_VERSION 0x110 #define TRACKPAD_2024_BT_VERSION 0x314 #define TRACKPAD_REPORT_ID 0x28 #define TRACKPAD2_USB_REPORT_ID 0x02 #define TRACKPAD2_BT_REPORT_ID 0x31 #define MOUSE_REPORT_ID 0x29 #define MOUSE2_REPORT_ID 0x12 #define DOUBLE_REPORT_ID 0xf7 #define USB_BATTERY_TIMEOUT_MS 60000 /* These definitions are not precise, but they're close enough. 
(Bits * 0x03 seem to indicate the aspect ratio of the touch, bits 0x70 seem * to be some kind of bit mask -- 0x20 may be a near-field reading, * and 0x40 is actual contact, and 0x10 may be a start/stop or change * indication.) */ #define TOUCH_STATE_MASK 0xf0 #define TOUCH_STATE_NONE 0x00 #define TOUCH_STATE_START 0x30 #define TOUCH_STATE_DRAG 0x40 /* Number of high-resolution events for each low-resolution detent. */ #define SCROLL_HR_STEPS 10 #define SCROLL_HR_MULT (120 / SCROLL_HR_STEPS) #define SCROLL_HR_THRESHOLD 90 /* units */ #define SCROLL_ACCEL_DEFAULT 7 /* Touch surface information. Dimension is in hundredths of a mm, min and max * are in units. */ #define MOUSE_DIMENSION_X (float)9056 #define MOUSE_MIN_X -1100 #define MOUSE_MAX_X 1258 #define MOUSE_RES_X ((MOUSE_MAX_X - MOUSE_MIN_X) / (MOUSE_DIMENSION_X / 100)) #define MOUSE_DIMENSION_Y (float)5152 #define MOUSE_MIN_Y -1589 #define MOUSE_MAX_Y 2047 #define MOUSE_RES_Y ((MOUSE_MAX_Y - MOUSE_MIN_Y) / (MOUSE_DIMENSION_Y / 100)) #define TRACKPAD_DIMENSION_X (float)13000 #define TRACKPAD_MIN_X -2909 #define TRACKPAD_MAX_X 3167 #define TRACKPAD_RES_X \ ((TRACKPAD_MAX_X - TRACKPAD_MIN_X) / (TRACKPAD_DIMENSION_X / 100)) #define TRACKPAD_DIMENSION_Y (float)11000 #define TRACKPAD_MIN_Y -2456 #define TRACKPAD_MAX_Y 2565 #define TRACKPAD_RES_Y \ ((TRACKPAD_MAX_Y - TRACKPAD_MIN_Y) / (TRACKPAD_DIMENSION_Y / 100)) #define TRACKPAD2_DIMENSION_X (float)16000 #define TRACKPAD2_MIN_X -3678 #define TRACKPAD2_MAX_X 3934 #define TRACKPAD2_RES_X \ ((TRACKPAD2_MAX_X - TRACKPAD2_MIN_X) / (TRACKPAD2_DIMENSION_X / 100)) #define TRACKPAD2_DIMENSION_Y (float)11490 #define TRACKPAD2_MIN_Y -2478 #define TRACKPAD2_MAX_Y 2587 #define TRACKPAD2_RES_Y \ ((TRACKPAD2_MAX_Y - TRACKPAD2_MIN_Y) / (TRACKPAD2_DIMENSION_Y / 100)) /** * struct magicmouse_sc - Tracks Magic Mouse-specific data. * @input: Input device through which we report events. * @quirks: Currently unused. * @ntouches: Number of touches in most recent touch report. 
* @scroll_accel: Number of consecutive scroll motions. * @scroll_jiffies: Time of last scroll motion. * @touches: Most recent data for a touch, indexed by tracking ID. * @tracking_ids: Mapping of current touch input data to @touches. * @hdev: Pointer to the underlying HID device. * @work: Workqueue to handle initialization retry for quirky devices. * @battery_timer: Timer for obtaining battery level information. */ struct magicmouse_sc { struct input_dev *input; unsigned long quirks; int ntouches; int scroll_accel; unsigned long scroll_jiffies; struct { short x; short y; short scroll_x; short scroll_y; short scroll_x_hr; short scroll_y_hr; u8 size; bool scroll_x_active; bool scroll_y_active; } touches[16]; int tracking_ids[16]; struct hid_device *hdev; struct delayed_work work; struct timer_list battery_timer; }; static int magicmouse_firm_touch(struct magicmouse_sc *msc) { int touch = -1; int ii; /* If there is only one "firm" touch, set touch to its * tracking ID. */ for (ii = 0; ii < msc->ntouches; ii++) { int idx = msc->tracking_ids[ii]; if (msc->touches[idx].size < 8) { /* Ignore this touch. */ } else if (touch >= 0) { touch = -1; break; } else { touch = idx; } } return touch; } static void magicmouse_emit_buttons(struct magicmouse_sc *msc, int state) { int last_state = test_bit(BTN_LEFT, msc->input->key) << 0 | test_bit(BTN_RIGHT, msc->input->key) << 1 | test_bit(BTN_MIDDLE, msc->input->key) << 2; if (emulate_3button) { int id; /* If some button was pressed before, keep it held * down. Otherwise, if there's exactly one firm * touch, use that to override the mouse's guess. */ if (state == 0) { /* The button was released. 
*/ } else if (last_state != 0) { state = last_state; } else if ((id = magicmouse_firm_touch(msc)) >= 0) { int x = msc->touches[id].x; if (x < middle_button_start) state = 1; else if (x > middle_button_stop) state = 2; else state = 4; } /* else: we keep the mouse's guess */ input_report_key(msc->input, BTN_MIDDLE, state & 4); } input_report_key(msc->input, BTN_LEFT, state & 1); input_report_key(msc->input, BTN_RIGHT, state & 2); if (state != last_state) msc->scroll_accel = SCROLL_ACCEL_DEFAULT; } static void magicmouse_emit_touch(struct magicmouse_sc *msc, int raw_id, u8 *tdata) { struct input_dev *input = msc->input; int id, x, y, size, orientation, touch_major, touch_minor, state, down; int pressure = 0; if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { id = (tdata[6] << 2 | tdata[5] >> 6) & 0xf; x = (tdata[1] << 28 | tdata[0] << 20) >> 20; y = -((tdata[2] << 24 | tdata[1] << 16) >> 20); size = tdata[5] & 0x3f; orientation = (tdata[6] >> 2) - 32; touch_major = tdata[3]; touch_minor = tdata[4]; state = tdata[7] & TOUCH_STATE_MASK; down = state != TOUCH_STATE_NONE; } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { id = tdata[8] & 0xf; x = (tdata[1] << 27 | tdata[0] << 19) >> 19; y = -((tdata[3] << 30 | tdata[2] << 22 | tdata[1] << 14) >> 19); size = tdata[6]; orientation = (tdata[8] >> 5) - 4; touch_major = tdata[4]; touch_minor = tdata[5]; pressure = tdata[7]; state = tdata[3] & 0xC0; down = state == 0x80; } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ id = (tdata[7] << 2 | tdata[6] >> 6) & 0xf; x = (tdata[1] << 27 | tdata[0] << 19) >> 19; y = -((tdata[3] << 30 | tdata[2] << 22 | tdata[1] << 14) >> 19); size = tdata[6] & 0x3f; orientation = (tdata[7] >> 2) - 32; touch_major = tdata[4]; touch_minor = tdata[5]; state = tdata[8] & TOUCH_STATE_MASK; down = state != TOUCH_STATE_NONE; } /* Store tracking ID and other 
fields. */ msc->tracking_ids[raw_id] = id; msc->touches[id].x = x; msc->touches[id].y = y; msc->touches[id].size = size; /* If requested, emulate a scroll wheel by detecting small * vertical touch motions. */ if (emulate_scroll_wheel && input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 && input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { unsigned long now = jiffies; int step_x = msc->touches[id].scroll_x - x; int step_y = msc->touches[id].scroll_y - y; int step_hr = max_t(int, ((64 - (int)scroll_speed) * msc->scroll_accel) / SCROLL_HR_STEPS, 1); int step_x_hr = msc->touches[id].scroll_x_hr - x; int step_y_hr = msc->touches[id].scroll_y_hr - y; /* Calculate and apply the scroll motion. */ switch (state) { case TOUCH_STATE_START: msc->touches[id].scroll_x = x; msc->touches[id].scroll_y = y; msc->touches[id].scroll_x_hr = x; msc->touches[id].scroll_y_hr = y; msc->touches[id].scroll_x_active = false; msc->touches[id].scroll_y_active = false; /* Reset acceleration after half a second. 
*/ if (scroll_acceleration && time_before(now, msc->scroll_jiffies + HZ / 2)) msc->scroll_accel = max_t(int, msc->scroll_accel - 1, 1); else msc->scroll_accel = SCROLL_ACCEL_DEFAULT; break; case TOUCH_STATE_DRAG: step_x /= (64 - (int)scroll_speed) * msc->scroll_accel; if (step_x != 0) { msc->touches[id].scroll_x -= step_x * (64 - scroll_speed) * msc->scroll_accel; msc->scroll_jiffies = now; input_report_rel(input, REL_HWHEEL, -step_x); } step_y /= (64 - (int)scroll_speed) * msc->scroll_accel; if (step_y != 0) { msc->touches[id].scroll_y -= step_y * (64 - scroll_speed) * msc->scroll_accel; msc->scroll_jiffies = now; input_report_rel(input, REL_WHEEL, step_y); } if (!msc->touches[id].scroll_x_active && abs(step_x_hr) > SCROLL_HR_THRESHOLD) { msc->touches[id].scroll_x_active = true; msc->touches[id].scroll_x_hr = x; step_x_hr = 0; } step_x_hr /= step_hr; if (step_x_hr != 0 && msc->touches[id].scroll_x_active) { msc->touches[id].scroll_x_hr -= step_x_hr * step_hr; input_report_rel(input, REL_HWHEEL_HI_RES, -step_x_hr * SCROLL_HR_MULT); } if (!msc->touches[id].scroll_y_active && abs(step_y_hr) > SCROLL_HR_THRESHOLD) { msc->touches[id].scroll_y_active = true; msc->touches[id].scroll_y_hr = y; step_y_hr = 0; } step_y_hr /= step_hr; if (step_y_hr != 0 && msc->touches[id].scroll_y_active) { msc->touches[id].scroll_y_hr -= step_y_hr * step_hr; input_report_rel(input, REL_WHEEL_HI_RES, step_y_hr * SCROLL_HR_MULT); } break; } } if (down) msc->ntouches++; input_mt_slot(input, id); input_mt_report_slot_state(input, MT_TOOL_FINGER, down); /* Generate the input events for this touch. 
*/ if (down) { input_report_abs(input, ABS_MT_TOUCH_MAJOR, touch_major << 2); input_report_abs(input, ABS_MT_TOUCH_MINOR, touch_minor << 2); input_report_abs(input, ABS_MT_ORIENTATION, -orientation); input_report_abs(input, ABS_MT_POSITION_X, x); input_report_abs(input, ABS_MT_POSITION_Y, y); if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) input_report_abs(input, ABS_MT_PRESSURE, pressure); if (report_undeciphered) { if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) input_event(input, EV_MSC, MSC_RAW, tdata[7]); else if (input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 && input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) input_event(input, EV_MSC, MSC_RAW, tdata[8]); } } } static int magicmouse_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); struct input_dev *input = msc->input; int x = 0, y = 0, ii, clicks = 0, npoints; switch (data[0]) { case TRACKPAD_REPORT_ID: case TRACKPAD2_BT_REPORT_ID: /* Expect four bytes of prefix, and N*9 bytes of touch data. */ if (size < 4 || ((size - 4) % 9) != 0) return 0; npoints = (size - 4) / 9; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for TRACKPAD_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 9 + 4); clicks = data[1]; /* The following bits provide a device specific timestamp. They * are unused here. * * ts = data[1] >> 6 | data[2] << 2 | data[3] << 10; */ break; case TRACKPAD2_USB_REPORT_ID: /* Expect twelve bytes of prefix and N*9 bytes of touch data. 
*/ if (size < 12 || ((size - 12) % 9) != 0) return 0; npoints = (size - 12) / 9; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for TRACKPAD2_USB_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 9 + 12); clicks = data[1]; break; case MOUSE_REPORT_ID: /* Expect six bytes of prefix, and N*8 bytes of touch data. */ if (size < 6 || ((size - 6) % 8) != 0) return 0; npoints = (size - 6) / 8; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for MOUSE_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 8 + 6); /* When emulating three-button mode, it is important * to have the current touch information before * generating a click event. */ x = (int)(((data[3] & 0x0c) << 28) | (data[1] << 22)) >> 22; y = (int)(((data[3] & 0x30) << 26) | (data[2] << 22)) >> 22; clicks = data[3]; /* The following bits provide a device specific timestamp. They * are unused here. * * ts = data[3] >> 6 | data[4] << 2 | data[5] << 10; */ break; case MOUSE2_REPORT_ID: /* Size is either 8 or (14 + 8 * N) */ if (size != 8 && (size < 14 || (size - 14) % 8 != 0)) return 0; npoints = (size - 14) / 8; if (npoints > 15) { hid_warn(hdev, "invalid size value (%d) for MOUSE2_REPORT_ID\n", size); return 0; } msc->ntouches = 0; for (ii = 0; ii < npoints; ii++) magicmouse_emit_touch(msc, ii, data + ii * 8 + 14); /* When emulating three-button mode, it is important * to have the current touch information before * generating a click event. */ x = (int)((data[3] << 24) | (data[2] << 16)) >> 16; y = (int)((data[5] << 24) | (data[4] << 16)) >> 16; clicks = data[1]; /* The following bits provide a device specific timestamp. They * are unused here. * * ts = data[11] >> 6 | data[12] << 2 | data[13] << 10; */ break; case DOUBLE_REPORT_ID: /* Sometimes the trackpad sends two touch reports in one * packet. 
*/ magicmouse_raw_event(hdev, report, data + 2, data[1]); magicmouse_raw_event(hdev, report, data + 2 + data[1], size - 2 - data[1]); return 0; default: return 0; } if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { magicmouse_emit_buttons(msc, clicks & 3); input_report_rel(input, REL_X, x); input_report_rel(input, REL_Y, y); } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { input_mt_sync_frame(input); input_report_key(input, BTN_MOUSE, clicks & 1); } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ input_report_key(input, BTN_MOUSE, clicks & 1); input_mt_report_pointer_emulation(input, true); } input_sync(input); return 1; } static int magicmouse_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); if (msc->input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2 && field->report->id == MOUSE2_REPORT_ID) { /* * magic_mouse_raw_event has done all the work. Skip hidinput. * * Specifically, hidinput may modify BTN_LEFT and BTN_RIGHT, * breaking emulate_3button. 
*/ return 1; } return 0; } static int magicmouse_setup_input(struct input_dev *input, struct hid_device *hdev) { int error; int mt_flags = 0; __set_bit(EV_KEY, input->evbit); if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { __set_bit(BTN_LEFT, input->keybit); __set_bit(BTN_RIGHT, input->keybit); if (emulate_3button) __set_bit(BTN_MIDDLE, input->keybit); __set_bit(EV_REL, input->evbit); __set_bit(REL_X, input->relbit); __set_bit(REL_Y, input->relbit); if (emulate_scroll_wheel) { __set_bit(REL_WHEEL, input->relbit); __set_bit(REL_HWHEEL, input->relbit); __set_bit(REL_WHEEL_HI_RES, input->relbit); __set_bit(REL_HWHEEL_HI_RES, input->relbit); } } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { /* If the trackpad has been connected to a Mac, the name is * automatically personalized, e.g., "José Expósito's Trackpad". * When connected through Bluetooth, the personalized name is * reported, however, when connected through USB the generic * name is reported. * Set the device name to ensure the same driver settings get * loaded, whether connected through bluetooth or USB. */ if (hdev->vendor == BT_VENDOR_ID_APPLE) { if (input->id.version == TRACKPAD2_2021_BT_VERSION) input->name = "Apple Inc. Magic Trackpad 2021"; else if (input->id.version == TRACKPAD_2024_BT_VERSION) { input->name = "Apple Inc. Magic Trackpad USB-C"; } else { input->name = "Apple Inc. 
Magic Trackpad"; } } else { /* USB_VENDOR_ID_APPLE */ input->name = hdev->name; } __clear_bit(EV_MSC, input->evbit); __clear_bit(BTN_0, input->keybit); __clear_bit(BTN_RIGHT, input->keybit); __clear_bit(BTN_MIDDLE, input->keybit); __set_bit(BTN_MOUSE, input->keybit); __set_bit(INPUT_PROP_BUTTONPAD, input->propbit); __set_bit(BTN_TOOL_FINGER, input->keybit); mt_flags = INPUT_MT_POINTER | INPUT_MT_DROP_UNUSED | INPUT_MT_TRACK; } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ /* input->keybit is initialized with incorrect button info * for Magic Trackpad. There really is only one physical * button (BTN_LEFT == BTN_MOUSE). Make sure we don't * advertise buttons that don't exist... */ __clear_bit(BTN_RIGHT, input->keybit); __clear_bit(BTN_MIDDLE, input->keybit); __set_bit(BTN_MOUSE, input->keybit); __set_bit(BTN_TOOL_FINGER, input->keybit); __set_bit(BTN_TOOL_DOUBLETAP, input->keybit); __set_bit(BTN_TOOL_TRIPLETAP, input->keybit); __set_bit(BTN_TOOL_QUADTAP, input->keybit); __set_bit(BTN_TOOL_QUINTTAP, input->keybit); __set_bit(BTN_TOUCH, input->keybit); __set_bit(INPUT_PROP_POINTER, input->propbit); __set_bit(INPUT_PROP_BUTTONPAD, input->propbit); } __set_bit(EV_ABS, input->evbit); error = input_mt_init_slots(input, 16, mt_flags); if (error) return error; input_set_abs_params(input, ABS_MT_TOUCH_MAJOR, 0, 255 << 2, 4, 0); input_set_abs_params(input, ABS_MT_TOUCH_MINOR, 0, 255 << 2, 4, 0); /* Note: Touch Y position from the device is inverted relative * to how pointer motion is reported (and relative to how USB * HID recommends the coordinates work). This driver keeps * the origin at the same position, and just uses the additive * inverse of the reported Y. 
*/ if (input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE || input->id.product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { input_set_abs_params(input, ABS_MT_ORIENTATION, -31, 32, 1, 0); input_set_abs_params(input, ABS_MT_POSITION_X, MOUSE_MIN_X, MOUSE_MAX_X, 4, 0); input_set_abs_params(input, ABS_MT_POSITION_Y, MOUSE_MIN_Y, MOUSE_MAX_Y, 4, 0); input_abs_set_res(input, ABS_MT_POSITION_X, MOUSE_RES_X); input_abs_set_res(input, ABS_MT_POSITION_Y, MOUSE_RES_Y); } else if (input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { input_set_abs_params(input, ABS_MT_PRESSURE, 0, 253, 0, 0); input_set_abs_params(input, ABS_PRESSURE, 0, 253, 0, 0); input_set_abs_params(input, ABS_MT_ORIENTATION, -3, 4, 0, 0); input_set_abs_params(input, ABS_X, TRACKPAD2_MIN_X, TRACKPAD2_MAX_X, 0, 0); input_set_abs_params(input, ABS_Y, TRACKPAD2_MIN_Y, TRACKPAD2_MAX_Y, 0, 0); input_set_abs_params(input, ABS_MT_POSITION_X, TRACKPAD2_MIN_X, TRACKPAD2_MAX_X, 0, 0); input_set_abs_params(input, ABS_MT_POSITION_Y, TRACKPAD2_MIN_Y, TRACKPAD2_MAX_Y, 0, 0); input_abs_set_res(input, ABS_X, TRACKPAD2_RES_X); input_abs_set_res(input, ABS_Y, TRACKPAD2_RES_Y); input_abs_set_res(input, ABS_MT_POSITION_X, TRACKPAD2_RES_X); input_abs_set_res(input, ABS_MT_POSITION_Y, TRACKPAD2_RES_Y); } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ input_set_abs_params(input, ABS_MT_ORIENTATION, -31, 32, 1, 0); input_set_abs_params(input, ABS_X, TRACKPAD_MIN_X, TRACKPAD_MAX_X, 4, 0); input_set_abs_params(input, ABS_Y, TRACKPAD_MIN_Y, TRACKPAD_MAX_Y, 4, 0); input_set_abs_params(input, ABS_MT_POSITION_X, TRACKPAD_MIN_X, TRACKPAD_MAX_X, 4, 0); input_set_abs_params(input, ABS_MT_POSITION_Y, TRACKPAD_MIN_Y, TRACKPAD_MAX_Y, 4, 0); input_abs_set_res(input, ABS_X, TRACKPAD_RES_X); input_abs_set_res(input, ABS_Y, TRACKPAD_RES_Y); input_abs_set_res(input, ABS_MT_POSITION_X, TRACKPAD_RES_X); input_abs_set_res(input, ABS_MT_POSITION_Y, TRACKPAD_RES_Y); } 
input_set_events_per_packet(input, 60); if (report_undeciphered && input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 && input->id.product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { __set_bit(EV_MSC, input->evbit); __set_bit(MSC_RAW, input->mscbit); } /* * hid-input may mark device as using autorepeat, but neither * the trackpad, nor the mouse actually want it. */ __clear_bit(EV_REP, input->evbit); return 0; } static int magicmouse_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); if (!msc->input) msc->input = hi->input; /* Magic Trackpad does not give relative data after switching to MT */ if ((hi->input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD || hi->input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || hi->input->id.product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) && field->flags & HID_MAIN_ITEM_RELATIVE) return -1; return 0; } static int magicmouse_input_configured(struct hid_device *hdev, struct hid_input *hi) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); int ret; ret = magicmouse_setup_input(msc->input, hdev); if (ret) { hid_err(hdev, "magicmouse setup input failed (%d)\n", ret); /* clean msc->input to notify probe() of the failure */ msc->input = NULL; return ret; } return 0; } static int magicmouse_enable_multitouch(struct hid_device *hdev) { const u8 *feature; const u8 feature_mt[] = { 0xD7, 0x01 }; const u8 feature_mt_mouse2[] = { 0xF1, 0x02, 0x01 }; const u8 feature_mt_trackpad2_usb[] = { 0x02, 0x01 }; const u8 feature_mt_trackpad2_bt[] = { 0xF1, 0x02, 0x01 }; u8 *buf; int ret; int feature_size; if (hdev->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || hdev->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { if (hdev->vendor == BT_VENDOR_ID_APPLE) { feature_size = sizeof(feature_mt_trackpad2_bt); feature = feature_mt_trackpad2_bt; } else { /* USB_VENDOR_ID_APPLE */ feature_size 
= sizeof(feature_mt_trackpad2_usb); feature = feature_mt_trackpad2_usb; } } else if (hdev->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { feature_size = sizeof(feature_mt_mouse2); feature = feature_mt_mouse2; } else { feature_size = sizeof(feature_mt); feature = feature_mt; } buf = kmemdup(feature, feature_size, GFP_KERNEL); if (!buf) return -ENOMEM; ret = hid_hw_raw_request(hdev, buf[0], buf, feature_size, HID_FEATURE_REPORT, HID_REQ_SET_REPORT); kfree(buf); return ret; } static void magicmouse_enable_mt_work(struct work_struct *work) { struct magicmouse_sc *msc = container_of(work, struct magicmouse_sc, work.work); int ret; ret = magicmouse_enable_multitouch(msc->hdev); if (ret < 0) hid_err(msc->hdev, "unable to request touch data (%d)\n", ret); } static int magicmouse_fetch_battery(struct hid_device *hdev) { #ifdef CONFIG_HID_BATTERY_STRENGTH struct hid_report_enum *report_enum; struct hid_report *report; if (!hdev->battery || hdev->vendor != USB_VENDOR_ID_APPLE || (hdev->product != USB_DEVICE_ID_APPLE_MAGICMOUSE2 && hdev->product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 && hdev->product != USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC)) return -1; report_enum = &hdev->report_enum[hdev->battery_report_type]; report = report_enum->report_id_hash[hdev->battery_report_id]; if (!report || report->maxfield < 1) return -1; if (hdev->battery_capacity == hdev->battery_max) return -1; hid_hw_request(hdev, report, HID_REQ_GET_REPORT); return 0; #else return -1; #endif } static void magicmouse_battery_timer_tick(struct timer_list *t) { struct magicmouse_sc *msc = from_timer(msc, t, battery_timer); struct hid_device *hdev = msc->hdev; if (magicmouse_fetch_battery(hdev) == 0) { mod_timer(&msc->battery_timer, jiffies + msecs_to_jiffies(USB_BATTERY_TIMEOUT_MS)); } } static int magicmouse_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct magicmouse_sc *msc; struct hid_report *report; int ret; msc = devm_kzalloc(&hdev->dev, sizeof(*msc), GFP_KERNEL); if (msc == NULL) 
{ hid_err(hdev, "can't alloc magicmouse descriptor\n"); return -ENOMEM; } msc->scroll_accel = SCROLL_ACCEL_DEFAULT; msc->hdev = hdev; INIT_DEFERRABLE_WORK(&msc->work, magicmouse_enable_mt_work); msc->quirks = id->driver_data; hid_set_drvdata(hdev, msc); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "magicmouse hid parse failed\n"); return ret; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "magicmouse hw start failed\n"); return ret; } timer_setup(&msc->battery_timer, magicmouse_battery_timer_tick, 0); mod_timer(&msc->battery_timer, jiffies + msecs_to_jiffies(USB_BATTERY_TIMEOUT_MS)); magicmouse_fetch_battery(hdev); if (id->vendor == USB_VENDOR_ID_APPLE && (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2 || ((id->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || id->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) && hdev->type != HID_TYPE_USBMOUSE))) return 0; if (!msc->input) { hid_err(hdev, "magicmouse input not registered\n"); ret = -ENOMEM; goto err_stop_hw; } if (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE) report = hid_register_report(hdev, HID_INPUT_REPORT, MOUSE_REPORT_ID, 0); else if (id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) report = hid_register_report(hdev, HID_INPUT_REPORT, MOUSE2_REPORT_ID, 0); else if (id->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || id->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) { if (id->vendor == BT_VENDOR_ID_APPLE) report = hid_register_report(hdev, HID_INPUT_REPORT, TRACKPAD2_BT_REPORT_ID, 0); else /* USB_VENDOR_ID_APPLE */ report = hid_register_report(hdev, HID_INPUT_REPORT, TRACKPAD2_USB_REPORT_ID, 0); } else { /* USB_DEVICE_ID_APPLE_MAGICTRACKPAD */ report = hid_register_report(hdev, HID_INPUT_REPORT, TRACKPAD_REPORT_ID, 0); report = hid_register_report(hdev, HID_INPUT_REPORT, DOUBLE_REPORT_ID, 0); } if (!report) { hid_err(hdev, "unable to register touch report\n"); ret = -ENOMEM; goto err_stop_hw; } report->size = 6; /* * Some devices repond with 'invalid report id' when 
feature * report switching it into multitouch mode is sent to it. * * This results in -EIO from the _raw low-level transport callback, * but there seems to be no other way of switching the mode. * Thus the super-ugly hacky success check below. */ ret = magicmouse_enable_multitouch(hdev); if (ret != -EIO && ret < 0) { hid_err(hdev, "unable to request touch data (%d)\n", ret); goto err_stop_hw; } if (ret == -EIO && id->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2) { schedule_delayed_work(&msc->work, msecs_to_jiffies(500)); } return 0; err_stop_hw: del_timer_sync(&msc->battery_timer); hid_hw_stop(hdev); return ret; } static void magicmouse_remove(struct hid_device *hdev) { struct magicmouse_sc *msc = hid_get_drvdata(hdev); if (msc) { cancel_delayed_work_sync(&msc->work); del_timer_sync(&msc->battery_timer); } hid_hw_stop(hdev); } static const __u8 *magicmouse_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { /* * Change the usage from: * 0x06, 0x00, 0xff, // Usage Page (Vendor Defined Page 1) 0 * 0x09, 0x0b, // Usage (Vendor Usage 0x0b) 3 * To: * 0x05, 0x01, // Usage Page (Generic Desktop) 0 * 0x09, 0x02, // Usage (Mouse) 2 */ if (hdev->vendor == USB_VENDOR_ID_APPLE && (hdev->product == USB_DEVICE_ID_APPLE_MAGICMOUSE2 || hdev->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2 || hdev->product == USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC) && *rsize == 83 && rdesc[46] == 0x84 && rdesc[58] == 0x85) { hid_info(hdev, "fixing up magicmouse battery report descriptor\n"); *rsize = *rsize - 1; rdesc = kmemdup(rdesc + 1, *rsize, GFP_KERNEL); if (!rdesc) return NULL; rdesc[0] = 0x05; rdesc[1] = 0x01; rdesc[2] = 0x09; rdesc[3] = 0x02; } return rdesc; } static const struct hid_device_id magic_mice[] = { { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE), .driver_data = 0 }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE2), .driver_data = 0 }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE2), 
.driver_data = 0 }, { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD), .driver_data = 0 }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2), .driver_data = 0 }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2), .driver_data = 0 }, { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC), .driver_data = 0 }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2_USBC), .driver_data = 0 }, { } }; MODULE_DEVICE_TABLE(hid, magic_mice); static struct hid_driver magicmouse_driver = { .name = "magicmouse", .id_table = magic_mice, .probe = magicmouse_probe, .remove = magicmouse_remove, .report_fixup = magicmouse_report_fixup, .raw_event = magicmouse_raw_event, .event = magicmouse_event, .input_mapping = magicmouse_input_mapping, .input_configured = magicmouse_input_configured, }; module_hid_driver(magicmouse_driver); MODULE_DESCRIPTION("Apple \"Magic\" Wireless Mouse driver"); MODULE_LICENSE("GPL");
188 15 15 6 15 9 27 27 4 11 11 14 10 5 1 12 5 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 // SPDX-License-Identifier: GPL-2.0-only #include <linux/interval_tree.h> #include <linux/interval_tree_generic.h> #include <linux/compiler.h> #include <linux/export.h> #define START(node) ((node)->start) #define LAST(node) ((node)->last) INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long, __subtree_last, START, LAST,, interval_tree) EXPORT_SYMBOL_GPL(interval_tree_insert); EXPORT_SYMBOL_GPL(interval_tree_remove); EXPORT_SYMBOL_GPL(interval_tree_iter_first); EXPORT_SYMBOL_GPL(interval_tree_iter_next); #ifdef CONFIG_INTERVAL_TREE_SPAN_ITER /* * Roll nodes[1] into nodes[0] by advancing nodes[1] to the end of a contiguous * span of nodes. This makes nodes[0]->last the end of that contiguous used span * indexes that started at the original nodes[1]->start. nodes[1] is now the * first node starting the next used span. A hole span is between nodes[0]->last * and nodes[1]->start. nodes[1] must be !NULL. 
*/ static void interval_tree_span_iter_next_gap(struct interval_tree_span_iter *state) { struct interval_tree_node *cur = state->nodes[1]; state->nodes[0] = cur; do { if (cur->last > state->nodes[0]->last) state->nodes[0] = cur; cur = interval_tree_iter_next(cur, state->first_index, state->last_index); } while (cur && (state->nodes[0]->last >= cur->start || state->nodes[0]->last + 1 == cur->start)); state->nodes[1] = cur; } void interval_tree_span_iter_first(struct interval_tree_span_iter *iter, struct rb_root_cached *itree, unsigned long first_index, unsigned long last_index) { iter->first_index = first_index; iter->last_index = last_index; iter->nodes[0] = NULL; iter->nodes[1] = interval_tree_iter_first(itree, first_index, last_index); if (!iter->nodes[1]) { /* No nodes intersect the span, whole span is hole */ iter->start_hole = first_index; iter->last_hole = last_index; iter->is_hole = 1; return; } if (iter->nodes[1]->start > first_index) { /* Leading hole on first iteration */ iter->start_hole = first_index; iter->last_hole = iter->nodes[1]->start - 1; iter->is_hole = 1; interval_tree_span_iter_next_gap(iter); return; } /* Starting inside a used */ iter->start_used = first_index; iter->is_hole = 0; interval_tree_span_iter_next_gap(iter); iter->last_used = iter->nodes[0]->last; if (iter->last_used >= last_index) { iter->last_used = last_index; iter->nodes[0] = NULL; iter->nodes[1] = NULL; } } EXPORT_SYMBOL_GPL(interval_tree_span_iter_first); void interval_tree_span_iter_next(struct interval_tree_span_iter *iter) { if (!iter->nodes[0] && !iter->nodes[1]) { iter->is_hole = -1; return; } if (iter->is_hole) { iter->start_used = iter->last_hole + 1; iter->last_used = iter->nodes[0]->last; if (iter->last_used >= iter->last_index) { iter->last_used = iter->last_index; iter->nodes[0] = NULL; iter->nodes[1] = NULL; } iter->is_hole = 0; return; } if (!iter->nodes[1]) { /* Trailing hole */ iter->start_hole = iter->nodes[0]->last + 1; iter->last_hole = iter->last_index; 
iter->nodes[0] = NULL; iter->is_hole = 1; return; } /* must have both nodes[0] and [1], interior hole */ iter->start_hole = iter->nodes[0]->last + 1; iter->last_hole = iter->nodes[1]->start - 1; iter->is_hole = 1; interval_tree_span_iter_next_gap(iter); } EXPORT_SYMBOL_GPL(interval_tree_span_iter_next); /* * Advance the iterator index to a specific position. The returned used/hole is * updated to start at new_index. This is faster than calling * interval_tree_span_iter_first() as it can avoid full searches in several * cases where the iterator is already set. */ void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter, struct rb_root_cached *itree, unsigned long new_index) { if (iter->is_hole == -1) return; iter->first_index = new_index; if (new_index > iter->last_index) { iter->is_hole = -1; return; } /* Rely on the union aliasing hole/used */ if (iter->start_hole <= new_index && new_index <= iter->last_hole) { iter->start_hole = new_index; return; } if (new_index == iter->last_hole + 1) interval_tree_span_iter_next(iter); else interval_tree_span_iter_first(iter, itree, new_index, iter->last_index); } EXPORT_SYMBOL_GPL(interval_tree_span_iter_advance); #endif
931 934 2 789 321 126 1 105 105 317 293 28 906 907 905 780 556 756 907 910 33 948 33 140 2 892 944 943 1 948 1 944 941 1 915 7 537 537 525 16 519 20 521 14 3 13 527 9 536 536 269 535 536 536 533 539 59 59 58 110 110 109 109 110 33 33 33 33 33 33 33 32 33 31 25 77 76 77 77 72 77 141 141 8 137 137 114 14 4 117 12 110 57 53 4 49 27 16 41 35 31 33 31 16 74 5 58 1 170 113 1 58 51 13 1065 657 660 655 2 584 167 1027 801 213 917 971 347 13 39 969 1 52 164 865 210 846 43 169 170 170 123 115 6 60 47 103 432 442 443 386 60 60 60 60 59 60 443 443 438 464 50 441 465 468 2 1 459 6 462 7 386 7 70 51 1 32 4 17 2 10 8 407 8 444 8 455 436 103 52 441 440 49 407 4 199 309 437 435 4 432 266 431 400 42 304 197 68 12 10 2 60 60 59 59 54 5 281 217 25 3 386 382 15 50 347 343 371 469 421 440 50 417 3 407 9 416 93 413 416 406 13 53 413 374 415 416 416 291 13 175 14 190 108 392 17 321 322 2 318 78 23 20 58 58 58 110 110 7 1 101 20 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 
1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 
2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 output functions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_output.c * * Changes: * A.N.Kuznetsov : airthmetics in fragmentation. * extension headers are implemented. * route changes now work. * ip6_forward does not confuse sniffers. * etc. * * H. von Brand : Added missing #include <linux/string.h> * Imran Patel : frag id should be in NBO * Kazunori MIYAZAWA @USAGI * : add ip6_append_data and related functions * for datagram xmit */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/in6.h> #include <linux/tcp.h> #include <linux/route.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/bpf-cgroup.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/sock.h> #include <net/snmp.h> #include <net/gso.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/rawv6.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/checksum.h> #include <linux/mroute6.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; struct ipv6hdr *hdr; struct neighbour *neigh; int ret; /* Be paranoid, rather than too clever. 
*/ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { /* Make sure idev stays alive */ rcu_read_lock(); skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return -ENOMEM; } rcu_read_unlock(); } hdr = ipv6_hdr(skb); daddr = &hdr->daddr; if (ipv6_addr_is_multicast(daddr)) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing is not supported in any case. */ if (newskb) NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); if (hdr->hop_limit == 0) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return 0; } } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && !(dev->flags & IFF_LOOPBACK)) { kfree_skb(skb); return 0; } } if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res != LWTUNNEL_XMIT_CONTINUE) return res; } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (IS_ERR_OR_NULL(neigh)) { if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } static int ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { struct sk_buff *segs, *nskb; netdev_features_t features; int ret = 0; /* Please see corresponding comment in ip_finish_output_gso 
* describing the cases where GSO segment length exceeds the * egress MTU. */ features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } consume_skb(skb); skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); /* Last GSO segment can be smaller than gso_size (and MTU). * Adding a fragment header would produce an "atomic fragment", * which is considered harmful (RFC-8021). Avoid that. */ err = segs->len > mtu ? ip6_fragment(net, sk, segs, ip6_finish_output2) : ip6_finish_output2(net, sk, segs); if (err && ret == 0) ret = err; } return ret; } static int ip6_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && !skb_gso_validate_network_len(skb, mtu)) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); return ip6_finish_output2(net, sk, skb); } static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IP6CB(skb)->flags |= IP6SKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip6_skb_dst_mtu(skb); if (skb_is_gso(skb)) return ip6_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) return ip6_fragment(net, sk, skb, ip6_finish_output2); return ip6_finish_output2(net, sk, skb); } static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: case NET_XMIT_CN: return __ip6_finish_output(net, sk, skb) ? 
: ret; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip6_finish_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } EXPORT_SYMBOL(ip6_output); bool ip6_autoflowlabel(struct net *net, const struct sock *sk) { if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) return ip6_default_np_autolabel(net); return inet6_test_bit(AUTOFLOWLABEL, sk); } /* * xmit an sk_buff (used by TCP, SCTP and DCCP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. 
*/
/*
 * ip6_xmit() prepends the extension headers described by @opt (if any) and
 * the IPv6 header to @skb, then passes it to the NF_INET_LOCAL_OUT hook with
 * dst_output() as the continuation.
 *
 * Return: -ENOBUFS if the headroom could not be expanded, 0 if
 * l3mdev_ip6_out() returned no skb, -EMSGSIZE (after reporting a local
 * error and freeing the skb) if the packet exceeds the dst MTU and is
 * neither ignore_df nor GSO, otherwise the NF_HOOK() result.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	/* Worst-case headroom: IPv6 header, jumbo hop-by-hop header, link layer */
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* Make sure idev stays alive */
		rcu_read_lock();
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			rcu_read_unlock();
			return -ENOBUFS;
		}
		rcu_read_unlock();
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	/*
	 * Length does not fit the limit checked here: push a hop-by-hop
	 * header carrying a jumbo payload option and leave the regular
	 * payload_len field at 0.
	 */
	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	/* A negative value means "not set": fall back to the route's hop limit */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 * Hand @skb to every socket on ip6_ra_chain whose selector equals @sel and
 * whose bound device (if any) matches the skb's device.  Sockets with
 * RTALERT_ISOLATE set are skipped when they live in a different network
 * namespace than the skb's device.  Every matching socket but the last
 * receives a clone; the last one receives @skb itself.
 *
 * Return: 1 if @skb was delivered to a socket, 0 if no socket matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Delivery lags one match behind so the final match gets the original */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

/*
 * Decide what to do with a packet whose destination is a proxied address.
 *
 * Return: 1 if the packet is one of the listed neighbour discovery ICMPv6
 * messages and should go to the input path, -1 if the destination is
 * link-local (link failure is signalled and the caller drops the packet),
 * 0 otherwise, including when the headers could not be parsed or pulled.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Only the first ICMPv6 byte (the type) is needed below */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

/*
 * Continuation of the NF_INET_FORWARD hook: clear the skb timestamp and
 * pass the packet to dst_output().  With CONFIG_NET_SWITCHDEV, an skb
 * carrying offload_l3_fwd_mark is consumed here instead and 0 is returned.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

/*
 * Return true if @skb must be rejected as larger than @mtu.  The checks are
 * applied in order: a packet that fits is fine; a recorded frag_max_size
 * above the MTU is too big; ignore_df allows the packet; a GSO packet is
 * allowed when skb_gso_validate_network_len() accepts it for @mtu.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

/*
 * ip6_forward - forward a received IPv6 packet along its dst entry.
 *
 * Return: 0 when the router-alert chain took the packet, the ip6_input()
 * result for proxied neighbour discovery, -ETIMEDOUT when the hop limit is
 * exhausted, -EMSGSIZE when the packet exceeds the forwarding MTU, -EINVAL
 * on every path through the error/drop labels, and otherwise the result of
 * the NF_INET_FORWARD hook.  The skb is consumed on all paths visible here.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* inet6_dev of the interface recorded in the control block (iif) */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding. The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the dst after xfrm6_route_forward() before using it below */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* Header pointer is re-read after skb_cow() */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

/*
 * Copy per-packet metadata from @from to @to: packet type, priority,
 * protocol, a new reference on the dst, device, mark, hash, tc_index (with
 * CONFIG_NET_SCHED), netfilter state, skb extensions and secmark.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

/*
 * ip6_fraglist_init - turn the head skb of a frag_list into the first
 * fragment and initialise @iter for the remaining ones.
 *
 * A copy of the first @hlen bytes of the network header is kept in
 * iter->tmp_hdr (allocated with kmemdup(); ip6_fragment() below frees it),
 * the frag_list is detached into iter->frag, and a fragment header with
 * offset 0 and the "more fragments" flag is inserted after the copied
 * headers.
 *
 * Return: 0 on success, -ENOMEM if the header copy could not be allocated.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Make room for the fragment header between the headers and the payload */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

/*
 * ip6_fraglist_prepare - build the headers of the next fragment.
 *
 * Pushes a fragment header and the saved headers (iter->tmp_hdr) in front
 * of iter->frag, advances iter->offset by the payload length of @skb (the
 * fragment prepared before this one), sets the "more fragments" flag when
 * another fragment follows, and copies the metadata of @skb to the fragment.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

/*
 * ip6_frag_init - initialise @state for slow-path fragmentation of @skb.
 *
 * Stores the arguments in @state.  state->left starts as the number of
 * bytes after the first @hlen bytes and state->ptr as the offset of the
 * first byte to copy; state->offset starts at 0.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

/*
 * ip6_frag_next - allocate and fill the next fragment of @skb.
 *
 * The fragment carries at most state->mtu payload bytes, rounded down to a
 * multiple of 8 unless it is the last one.  The headers of @skb are copied,
 * the "next header" byte at the position recorded in state->prevhdr is
 * rewritten to NEXTHDR_FRAGMENT, and a fragment header is filled in.
 * @state is advanced past the copied data.
 *
 * Return: the new fragment, or ERR_PTR(-ENOMEM) if allocation failed.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Same offset as prevhdr has in @skb, applied to the copied header */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

/*
 * ip6_fragment - split @skb into fragments and send each through @output.
 *
 * Two strategies are visible below.  If @skb already has a frag_list whose
 * geometry suits the MTU, the existing buffers are reused (fast path);
 * otherwise each fragment is freshly allocated and filled by
 * ip6_frag_next() (slow path), and the original skb is consumed at the end.
 *
 * Return: 0 on success, -EMSGSIZE (after sending ICMPV6_PKT_TOOBIG) when
 * fragmentation is not permitted or not possible for this MTU, or the error
 * from a helper or from @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	/* A non-negative result is used as the header length */
	hlen = err;
	nexthdr = *prevhdr;
	/* Kept as an offset so prevhdr can be recomputed after the checksum step */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Need room for the headers, a fragment header and at least 8 payload bytes */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on mtu is the payload capacity of one fragment */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Free the fragments that were not sent */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the socket/truesize changes made before the failing fragment */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

/*
 * Return non-zero when a cached route can NOT be shown to match @fl_addr:
 * the route key is not a /128 equal to @fl_addr, and there is either no
 * cached address or the cached address differs from @fl_addr.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

/*
 * Validate a dst taken from the socket cache against flow @fl6.
 *
 * Return: @dst if it is still usable, otherwise NULL.  When NULL is
 * returned for a non-NULL @dst, the reference on it has been released.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

/*
 * Common tail of the route lookup helpers.
 *
 * On entry *@dst is either NULL or a candidate route.  If the flow has no
 * source address, a route is looked up first and a source address is
 * selected from it.  On success *@dst holds the route and 0 is returned; on
 * failure the route is released, *@dst is set to NULL and a negative errno
 * is returned (-ENETUNREACH is additionally counted as OUTNOROUTES).
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a flag here: "a neighbour exists and is not NUD_VALID" */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only accepted with a v4-mapped or unspecified destination */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
*/ int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { *dst = NULL; return ip6_dst_lookup_tail(net, sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); /** * ip6_dst_lookup_flow - perform route lookup on flow with ipsec * @net: Network namespace to perform lookup in * @sk: socket which provides route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * * This function performs a route lookup on the given flow. * * It returns a valid dst pointer on success, or a pointer encoded * error code. */ struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst) { struct dst_entry *dst = NULL; int err; err = ip6_dst_lookup_tail(net, sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); } EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); /** * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow * @sk: socket which provides the dst cache and route info * @fl6: flow to lookup * @final_dst: final destination address for ipsec lookup * @connected: whether @sk is connected or not * * This function performs a route lookup on the given flow with the * possibility of using the cached route in the socket if it is valid. * It will take the socket dst lock when operating on the dst cache. * As a result, this function can only be used in process context. * * In addition, for a connected socket, cache the dst in the socket * if the current cache is not valid. * * It returns a valid dst pointer on success, or a pointer encoded * error code. 
*/
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	/* Use the cached dst only if it still matches this flow */
	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	/* The socket cache gets its own reference; the caller keeps the one returned */
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/*
 * Duplicate an option header.  The copy is (hdrlen + 1) * 8 bytes long.
 * Returns NULL when @src is NULL or the allocation fails.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/*
 * Duplicate a routing header.  The copy is (hdrlen + 1) * 8 bytes long.
 * Returns NULL when @src is NULL or the allocation fails.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/*
 * Recompute *@mtu and *@maxfraglen for the next skb to be allocated.
 * Nothing is changed for DST_XFRM_TUNNEL routes.  @skb is the current tail
 * skb; NULL means the first fragment is about to be created.
 */
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

/*
 * Initialise the cork state for a new corked send.
 *
 * Stores the route, a private copy of the tx options from @ipc6 (if any),
 * hop limit, traffic class, fragment size, GSO size, mark, priority,
 * timestamping flags and transmit time.
 *
 * Return: 0 on success, -EINVAL if options are already attached to
 * @v6_cork, -ENOBUFS if copying the options failed.  On an error return the
 * partially filled state is left in place; per the comment below it is
 * meant to be torn down by ip6_cork_release().
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* A NULL copy is only an error when the source header exists */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	/* With pmtudisc >= IPV6_PMTUDISC_PROBE the device MTU is used instead of the route MTU */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

/*
 * __ip6_append_data - append @length bytes obtained through @getfrag to the
 * skbs on @queue, allocating new skbs as the current tail fills up.
 *
 * @transhdrlen is non-zero only for the first chunk of a datagram; it is
 * reset to 0 after the first skb has been allocated.  cork->length tracks
 * the total amount appended so far.
 *
 * Return: 0 on success; -EMSGSIZE when the data cannot be sent with the
 * current MTU (a local error is reported to the socket); -EINVAL, -ENOBUFS,
 * -EPERM, -EFAULT, -ENOMEM, -EIO or a helper's error otherwise.  On the
 * error label the not-yet-appended length is subtracted from cork->length
 * again and the output-discard counter is bumped.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	/* Only the first skb of the datagram reserves room for these headers */
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			/* Remember the increment so the error path can undo it */
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Allocate only while write memory stays within twice sk_sndbuf */
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the bytes beyond maxfraglen from the previous skb into this one */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					   from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			/* A non-negative result is used as the number of bytes added */
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
				    from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	/* Charge the accumulated write memory to the socket in one step */
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

/*
 * ip6_append_data - append data to the socket's pending write queue.
 *
 * Returns 0 immediately for MSG_PROBE.  When the write queue is empty the
 * cork is set up first (taking a reference on @rt and storing a copy of
 * @fl6), and the fragmentable option length is added to both @length and
 * @transhdrlen.  For a non-empty queue @transhdrlen is forced to 0.
 *
 * Return: the ip6_setup_cork() error, or the __ip6_append_data() result.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

/*
 * Move the cork's dst (and the reference it holds) to @skb, leaving the
 * cork without a dst.
 */
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

/*
 * Free the option copies attached to @v6_cork and release the dst held by
 * @cork.  Both pointers are cleared afterwards.
 */
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/*
Allow local fragmentation. */ skb->ignore_df = ip6_sk_ignore_df(sk); __skb_pull(skb, skb_network_header_len(skb)); final_dst = &fl6->daddr; if (opt && opt->opt_flen) ipv6_push_frag_opts(skb, opt, &proto); if (opt && opt->opt_nflen) ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, ip6_autoflowlabel(net, sk), fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; skb->priority = cork->base.priority; skb->mark = cork->base.mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); else skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; if (sk->sk_socket->type == SOCK_RAW && !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } ip6_cork_release(cork, v6_cork); out: return skb; } int ip6_send_skb(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; rcu_read_lock(); err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); } rcu_read_unlock(); return err; } int ip6_push_pending_frames(struct sock *sk) { struct sk_buff *skb; skb = ip6_finish_skb(sk); if (!skb) return 0; return ip6_send_skb(skb); } EXPORT_SYMBOL_GPL(ip6_push_pending_frames); static void __ip6_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full 
*cork, struct inet6_cork *v6_cork) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(queue)) != NULL) { if (skb_dst(skb)) IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); } ip6_cork_release(cork, v6_cork); } void ip6_flush_pending_frames(struct sock *sk) { __ip6_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, &inet6_sk(sk)->cork); } EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork) { struct inet6_cork v6_cork; struct sk_buff_head queue; int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); int err; if (flags & MSG_PROBE) { dst_release(&rt->dst); return NULL; } __skb_queue_head_init(&queue); cork->base.flags = 0; cork->base.addr = 0; cork->base.opt = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt); if (err) { ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, &current->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); } return __ip6_make_skb(sk, &queue, cork, &v6_cork); }
4 15 2 13 3 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/environ.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" /** * tomoyo_check_env_acl - Check permission for environment variable's name. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_env_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_env_acl *acl = container_of(ptr, typeof(*acl), head); return tomoyo_path_matches_pattern(r->param.environ.name, acl->env); } /** * tomoyo_audit_env_log - Audit environment variable name log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_env_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "misc env %s\n", r->param.environ.name->name); } /** * tomoyo_env_perm - Check permission for environment variable's name. * * @r: Pointer to "struct tomoyo_request_info". * @env: The name of environment variable. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
*/ int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env) { struct tomoyo_path_info environ; int error; if (!env || !*env) return 0; environ.name = env; tomoyo_fill_path_info(&environ); r->param_type = TOMOYO_TYPE_ENV_ACL; r->param.environ.name = &environ; do { tomoyo_check_acl(r, tomoyo_check_env_acl); error = tomoyo_audit_env_log(r); } while (error == TOMOYO_RETRY_REQUEST); return error; } /** * tomoyo_same_env_acl - Check for duplicated "struct tomoyo_env_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_env_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_env_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_env_acl *p2 = container_of(b, typeof(*p2), head); return p1->env == p2->env; } /** * tomoyo_write_env - Write "struct tomoyo_env_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_write_env(struct tomoyo_acl_param *param) { struct tomoyo_env_acl e = { .head.type = TOMOYO_TYPE_ENV_ACL }; int error = -ENOMEM; const char *data = tomoyo_read_token(param); if (!tomoyo_correct_word(data) || strchr(data, '=')) return -EINVAL; e.env = tomoyo_get_name(data); if (!e.env) return error; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_env_acl, NULL); tomoyo_put_name(e.env); return error; } /** * tomoyo_write_misc - Update environment variable list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_misc(struct tomoyo_acl_param *param) { if (tomoyo_str_starts(&param->data, "env ")) return tomoyo_write_env(param); return -EINVAL; }
437 3616 620 676 380 70 6 16 941 940 1747 1589 1 3754 4065 3971 49 105 4 1046 418 1017 2900 579 3930 908 31 208 64 113 4 113 197 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 
495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/buffer_head.h
 *
 * Everything to do with buffer_heads.
 */

#ifndef _LINUX_BUFFER_HEAD_H
#define _LINUX_BUFFER_HEAD_H

#include <linux/types.h>
#include <linux/blk_types.h>
#include <linux/fs.h>
#include <linux/linkage.h>
#include <linux/pagemap.h>
#include <linux/wait.h>
#include <linux/atomic.h>

/* Bit numbers within buffer_head.b_state. */
enum bh_state_bits {
	BH_Uptodate,	/* Contains valid data */
	BH_Dirty,	/* Is dirty */
	BH_Lock,	/* Is locked */
	BH_Req,		/* Has been submitted for I/O */

	BH_Mapped,	/* Has a disk mapping */
	BH_New,		/* Disk mapping was newly created by get_block */
	BH_Async_Read,	/* Is under end_buffer_async_read I/O */
	BH_Async_Write,	/* Is under end_buffer_async_write I/O */
	BH_Delay,	/* Buffer is not yet allocated on disk */
	BH_Boundary,	/* Block is followed by a discontiguity */
	BH_Write_EIO,	/* I/O error on write */
	BH_Unwritten,	/* Buffer is allocated on disk but not written */
	BH_Quiet,	/* Buffer error printks to be quiet */
	BH_Meta,	/* Buffer contains metadata */
	BH_Prio,	/* Buffer should be submitted with REQ_PRIO */
	BH_Defer_Completion, /* Defer AIO completion to workqueue */

	BH_PrivateStart,/* not a state bit, but the first bit available
			 * for private allocation by other entities
			 */
};

/* PAGE_SIZE expressed in 512-byte units. */
#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)

struct page;
struct buffer_head;
struct address_space;
/* Type of the b_end_io completion callback stored in struct buffer_head. */
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);

/*
 * Historically, a buffer_head was used to map a single block
 * within a page, and of course as the unit of I/O through the
 * filesystem and block layers.  Nowadays the basic I/O unit
 * is the bio, and buffer_heads are used for extracting block
 * mappings (via a get_block_t call), for tracking state within
 * a folio (via a folio_mapping) and for wrapping bio submission
 * for backward compatibility reasons (e.g. submit_bh).
 */
struct buffer_head {
	unsigned long b_state;		/* buffer state bitmap (see above) */
	struct buffer_head *b_this_page;/* circular list of page's buffers */
	union {
		struct page *b_page;	/* the page this bh is mapped to */
		struct folio *b_folio;	/* the folio this bh is mapped to */
	};

	sector_t b_blocknr;		/* start block number */
	size_t b_size;			/* size of mapping */
	char *b_data;			/* pointer to data within the page */

	struct block_device *b_bdev;
	bh_end_io_t *b_end_io;		/* I/O completion */
	void *b_private;		/* reserved for b_end_io */
	struct list_head b_assoc_buffers; /* associated with another mapping */
	struct address_space *b_assoc_map;	/* mapping this buffer is
						   associated with */
	atomic_t b_count;		/* users using this buffer_head */
	spinlock_t b_uptodate_lock;	/* Used by the first bh in a page, to
					 * serialise IO completion of other
					 * buffers in the page */
};

/*
 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
 * and buffer_foo() functions.
 * To avoid resetting buffer flags that are already set, because that causes
 * a costly cache line transition, check the flag first.
 */
#define BUFFER_FNS(bit, name)						\
static __always_inline void set_buffer_##name(struct buffer_head *bh)	\
{									\
	if (!test_bit(BH_##bit, &(bh)->b_state))			\
		set_bit(BH_##bit, &(bh)->b_state);			\
}									\
static __always_inline void clear_buffer_##name(struct buffer_head *bh)	\
{									\
	clear_bit(BH_##bit, &(bh)->b_state);				\
}									\
static __always_inline int buffer_##name(const struct buffer_head *bh)	\
{									\
	return test_bit(BH_##bit, &(bh)->b_state);			\
}

/*
 * test_set_buffer_foo() and test_clear_buffer_foo()
 */
#define TAS_BUFFER_FNS(bit, name)					\
static __always_inline int test_set_buffer_##name(struct buffer_head *bh) \
{									\
	return test_and_set_bit(BH_##bit, &(bh)->b_state);		\
}									\
static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \
{									\
	return test_and_clear_bit(BH_##bit, &(bh)->b_state);		\
}									\

/*
 * Emit the buffer bitops functions.   Note that there are also functions
 * of the form "mark_buffer_foo()".  These are higher-level functions which
 * do something in addition to setting a b_state bit.
 */
BUFFER_FNS(Dirty, dirty)
TAS_BUFFER_FNS(Dirty, dirty)
BUFFER_FNS(Lock, locked)
BUFFER_FNS(Req, req)
TAS_BUFFER_FNS(Req, req)
BUFFER_FNS(Mapped, mapped)
BUFFER_FNS(New, new)
BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

/*
 * The BH_Uptodate accessors are written out by hand rather than emitted by
 * BUFFER_FNS() because the set and test sides carry explicit memory ordering.
 */
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
	/*
	 * If somebody else already set this uptodate, they will
	 * have done the memory barrier, and a reader will thus
	 * see *some* valid buffer state.
	 *
	 * Any other serialization (with IO errors or whatever that
	 * might clear the bit) has to come from other state (eg BH_Lock).
	 */
	if (test_bit(BH_Uptodate, &bh->b_state))
		return;

	/*
	 * make it consistent with folio_mark_uptodate
	 * pairs with smp_load_acquire in buffer_uptodate
	 */
	smp_mb__before_atomic();
	set_bit(BH_Uptodate, &bh->b_state);
}

static __always_inline void clear_buffer_uptodate(struct buffer_head *bh)
{
	clear_bit(BH_Uptodate, &bh->b_state);
}

static __always_inline int buffer_uptodate(const struct buffer_head *bh)
{
	/*
	 * make it consistent with folio_test_uptodate
	 * pairs with smp_mb__before_atomic in set_buffer_uptodate
	 */
	return test_bit_acquire(BH_Uptodate, &bh->b_state);
}

/*
 * Low bits of the b_data address, masked with page_size(b_page) - 1:
 * the byte offset of the buffer's data within its page.
 */
static inline unsigned long bh_offset(const struct buffer_head *bh)
{
	return (unsigned long)(bh)->b_data & (page_size(bh->b_page) - 1);
}

/* If we *know* page->private refers to buffer_heads */
#define page_buffers(page)					\
	({							\
		BUG_ON(!PagePrivate(page));			\
		((struct buffer_head *)page_private(page));	\
	})
#define page_has_buffers(page)	PagePrivate(page)
#define folio_buffers(folio)		folio_get_private(folio)

void buffer_check_dirty_writeback(struct folio *folio,
				     bool *dirty, bool *writeback);

/*
 * Declarations
 */

void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
		  unsigned long offset);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
					gfp_t gfp);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size);
struct buffer_head *create_empty_buffers(struct folio *folio,
		unsigned long blocksize, unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);

/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
				  bool datasync);
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
			  bool datasync);
void clean_bdev_aliases(struct block_device *bdev, sector_t block,
			sector_t len);
/* Single-block form of clean_bdev_aliases() for the block @bh refers to. */
static inline void clean_bdev_bh_alias(struct buffer_head *bh)
{
	clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
}

void mark_buffer_async_write(struct buffer_head *bh);
void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
			unsigned size);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
		unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *,
				sector_t block, unsigned size, gfp_t gfp);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh);
void __lock_buffer(struct buffer_head *bh);
int sync_dirty_buffer(struct buffer_head *bh);
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags);
void submit_bh(blk_opf_t, struct buffer_head *);
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize);
int bh_uptodate_or_lock(struct buffer_head *bh);
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[],
		     blk_opf_t op_flags, bool force_lock);

/*
 * Generic address_space_operations implementations for buffer_head-backed
 * address_spaces.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
		void *get_block);
int __block_write_full_folio(struct inode *inode, struct folio *folio,
		get_block_t *get_block, struct writeback_control *wbc);
int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		struct folio **foliop, get_block_t *get_block);
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
		get_block_t *get_block);
int block_write_end(struct file *, struct address_space *,
				loff_t, unsigned len, unsigned copied,
				struct folio *, void *);
int generic_write_end(struct file *, struct address_space *,
				loff_t, unsigned len, unsigned copied,
				struct folio *, void *);
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
int cont_write_begin(struct file *, struct address_space *, loff_t,
			unsigned, struct folio **, void **,
			get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
void block_commit_write(struct page *page, unsigned int from, unsigned int to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
				get_block_t get_block);
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int block_truncate_page(struct address_space *, loff_t, get_block_t *);

/* Without CONFIG_MIGRATION both migrate helpers are defined as NULL. */
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_folio(struct address_space *,
		struct folio *dst, struct folio *src, enum migrate_mode);
extern int buffer_migrate_folio_norefs(struct address_space *,
		struct folio *dst, struct folio *src, enum migrate_mode);
#else
#define buffer_migrate_folio NULL
#define buffer_migrate_folio_norefs NULL
#endif

/*
 * inline definitions
 */

/* Take a reference: atomically increment bh->b_count. */
static inline void get_bh(struct buffer_head *bh)
{
        atomic_inc(&bh->b_count);
}

/*
 * Drop a reference: atomically decrement bh->b_count, preceded by
 * smp_mb__before_atomic().
 */
static inline void put_bh(struct buffer_head *bh)
{
        smp_mb__before_atomic();
        atomic_dec(&bh->b_count);
}

/**
 * brelse - Release a buffer.
 * @bh: The buffer to release.
 *
 * Decrement a buffer_head's reference count.  If @bh is NULL, this
 * function is a no-op.
 *
 * If all buffers on a folio have zero reference count, are clean
 * and unlocked, and if the folio is unlocked and not under writeback
 * then try_to_free_buffers() may strip the buffers from the folio in
 * preparation for freeing it (sometimes, rarely, buffers are removed
 * from a folio but it ends up not being freed, and buffers may later
 * be reattached).
 *
 * Context: Any context.
 */
static inline void brelse(struct buffer_head *bh)
{
	if (bh)
		__brelse(bh);
}

/**
 * bforget - Discard any dirty data in a buffer.
 * @bh: The buffer to forget.
 *
 * Call this function instead of brelse() if the data written to a buffer
 * no longer needs to be written back.  It will clear the buffer's dirty
 * flag so writeback of this buffer will be skipped.
 *
 * Context: Any context.
 */
static inline void bforget(struct buffer_head *bh)
{
	if (bh)
		__bforget(bh);
}

/*
 * Read @block through __bread_gfp() using the superblock's block device
 * and block size, passing __GFP_MOVABLE.
 */
static inline struct buffer_head *
sb_bread(struct super_block *sb, sector_t block)
{
	return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
}

/* Same as sb_bread() but passes 0 as the gfp argument. */
static inline struct buffer_head *
sb_bread_unmovable(struct super_block *sb, sector_t block)
{
	return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0);
}

/* Calls __breadahead() with the superblock's block device and block size. */
static inline void
sb_breadahead(struct super_block *sb, sector_t block)
{
	__breadahead(sb->s_bdev, block, sb->s_blocksize);
}

/*
 * bdev_getblk() with the bdev mapping's gfp constraint (masked by ~__GFP_FS)
 * plus __GFP_NOFAIL; __GFP_MOVABLE is not added.
 */
static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
		sector_t block, unsigned size)
{
	gfp_t gfp;

	gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
	gfp |= __GFP_NOFAIL;

	return bdev_getblk(bdev, block, size, gfp);
}

/*
 * bdev_getblk() with the bdev mapping's gfp constraint (masked by ~__GFP_FS)
 * plus __GFP_MOVABLE | __GFP_NOFAIL.
 */
static inline struct buffer_head *__getblk(struct block_device *bdev,
		sector_t block, unsigned size)
{
	gfp_t gfp;

	gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
	gfp |= __GFP_MOVABLE | __GFP_NOFAIL;

	return bdev_getblk(bdev, block, size, gfp);
}

/* __getblk() on the superblock's block device and block size. */
static inline struct buffer_head *sb_getblk(struct super_block *sb,
		sector_t block)
{
	return __getblk(sb->s_bdev, block, sb->s_blocksize);
}

/* bdev_getblk() on the superblock's device with a caller-supplied @gfp. */
static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
		sector_t block, gfp_t gfp)
{
	return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);
}

/* __find_get_block() on the superblock's block device and block size. */
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, sector_t block)
{
	return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
}

/*
 * Mark @bh mapped and fill in b_bdev, b_blocknr and b_size from @sb and
 * @block.
 */
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
	set_buffer_mapped(bh);
	bh->b_bdev = sb->s_bdev;
	bh->b_blocknr = block;
	bh->b_size = sb->s_blocksize;
}

/* Calls __wait_on_buffer() only when BH_Lock is currently set. */
static inline void wait_on_buffer(struct buffer_head *bh)
{
	might_sleep();
	if (buffer_locked(bh))
		__wait_on_buffer(bh);
}

/*
 * Returns the negation of test_and_set_bit_lock() on BH_Lock, i.e. nonzero
 * when that call reported the bit as previously clear.
 */
static inline int trylock_buffer(struct buffer_head *bh)
{
	return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
}

/* Try the lock first; fall back to __lock_buffer() if that fails. */
static inline void lock_buffer(struct buffer_head *bh)
{
	might_sleep();
	if (!trylock_buffer(bh))
		__lock_buffer(bh);
}

/*
 * Issue a non-waiting read only if the buffer is not uptodate and the lock
 * can be taken without blocking.  Uptodate is re-checked after the lock is
 * held; if it became uptodate in between, the lock is simply released.
 */
static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
{
	if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
		if (!buffer_uptodate(bh))
			__bh_read(bh, op_flags, false);
		else
			unlock_buffer(bh);
	}
}

/*
 * If bh_uptodate_or_lock() returns zero, start __bh_read() with wait == false.
 */
static inline void bh_read_nowait(struct buffer_head *bh, blk_opf_t op_flags)
{
	if (!bh_uptodate_or_lock(bh))
		__bh_read(bh, op_flags, false);
}

/* Returns 1 if buffer uptodated, 0 on success, and -EIO on error. */
static inline int bh_read(struct buffer_head *bh, blk_opf_t op_flags)
{
	if (bh_uptodate_or_lock(bh))
		return 1;
	return __bh_read(bh, op_flags, true);
}

/* __bh_read_batch() with op_flags 0 and force_lock true. */
static inline void bh_read_batch(int nr, struct buffer_head *bhs[])
{
	__bh_read_batch(nr, bhs, 0, true);
}

/* __bh_read_batch() with the caller's @op_flags and force_lock false. */
static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
				      blk_opf_t op_flags)
{
	__bh_read_batch(nr, bhs, op_flags, false);
}

/**
 * __bread() - Read a block.
 * @bdev: The block device to read from.
 * @block: Block number in units of block size.
 * @size: The block size of this device in bytes.
 *
 * Read a specified block, and return the buffer head that refers
 * to it.  The memory is allocated from the movable area so that it can
 * be migrated.  The returned buffer head has its refcount increased.
 * The caller should call brelse() when it has finished with the buffer.
 *
 * Context: May sleep waiting for I/O.
 * Return: NULL if the block was unreadable.
 */
static inline struct buffer_head *__bread(struct block_device *bdev,
		sector_t block, unsigned size)
{
	return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
}

/**
 * get_nth_bh - Get a reference on the n'th buffer after this one.
 * @bh: The buffer to start counting from.
 * @count: How many buffers to skip.
 *
 * This is primarily useful for finding the nth buffer in a folio; in
 * that case you pass the head buffer and the byte offset in the folio
 * divided by the block size.  It can be used for other purposes, but
 * it will wrap at the end of the folio rather than returning NULL or
 * proceeding to the next folio for you.
 *
 * Return: The requested buffer with an elevated refcount.
 */
static inline __must_check
struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
{
	/* Follow the b_this_page links @count times. */
	while (count--)
		bh = bh->b_this_page;
	get_bh(bh);
	return bh;
}

bool block_dirty_folio(struct address_space *mapping, struct folio *folio);

#ifdef CONFIG_BUFFER_HEAD

void buffer_init(void);
bool try_to_free_buffers(struct folio *folio);
int inode_has_buffers(struct inode *inode);
void invalidate_inode_buffers(struct inode *inode);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
void invalidate_bh_lrus(void);
void invalidate_bh_lrus_cpu(void);
bool has_bh_in_lru(int cpu, void *dummy);
extern int buffer_heads_over_limit;

#else /* CONFIG_BUFFER_HEAD */

/*
 * Stubs used when CONFIG_BUFFER_HEAD is not set: each one does nothing and
 * returns a fixed value.
 */
static inline void buffer_init(void) {}
static inline bool try_to_free_buffers(struct folio *folio) { return true; }
static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
static inline void invalidate_bh_lrus(void) {}
static inline void invalidate_bh_lrus_cpu(void) {}
static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0

#endif /* CONFIG_BUFFER_HEAD */
#endif /* _LINUX_BUFFER_HEAD_H */
12 10 1 2 27 13 27 10 2 2 1 5 2 3 5 34 34 34 26 26 26 5 1 1 1 1 5 5 4 4 6 5 1 1 7 6 1 1 11 42 41 10 32 15 1 20 21 20 7 7 26 8 1 1 7 10 2 18 26 21 27 34 14 33 2 34 14 14 34 2 2 2 34 27 27 27 27 24 2 3 22 3 2 1 22 3 3 22 22 25 18 42 2158 109 2161 15 1 1 8 12 2 8 3 3 292 3 18 18 34 34 35 36 30 295 295 1 299 295 298 295 296 296 293 294 294 293 296 296 293 296 295 100 28 7 57 5 1 6 5 1 6 7 1 6 7 1 2 4 6 6 5 6 1 6 1 34 34 19 8 11 9 35 35 18 18 18 4 9 18 2 102 102 17 1 16 16 3 182 1834 1841 119 4 1687 1693 62 34 2 2 1 1 23 41 1 1690 62 62 4 3 60 2 41 2 15 15 47 2003 1818 1 171 2 1865 2 2 19 55 4 129 119 1873 1985 293 1699 1 1685 226 15 277 105 185 6 286 2 283 15 4 3 9 102 1852 1953 1943 1952 15 1954 1 5 1943 8 1 98 1 1 1823 1937 1926 6 7 7 96 33 1997 1998 2001 1998 5 5 3 1 3 3 6 5 1 3 3 7 1 7 13 10 1 1 10 14 1 13 12 10 16 15 14 6 8 14 2 36 1 34 34 34 19 192 260 1 261 260 1 1 2 211 58 1896 1 1899 1 3 34 27 1 6 1 11 9 1 1 1 4 1 8 12 17 2 3 15 18 6 2 3 1 3 1 3 3 5 6 6 5 1 2 1 1 4 7 4 7 2 1 2 3 33 16 2 2 2 2 2 1 2 1 4 176 49 4 53 132 1 14 1 1 160 1 13 22 1 7 130 2 2 2 2 2 4 1 2 2 2 1 6 1 2 1 2 1 1 33 2 6 2 15 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 6 2 2 4 3 4 1 1 6 132 176 5 2 3 2 2 27 27 2 2 1 93 34 2 1 1 24 3 6 2 4 14 14 14 1773 1642 177 14 36 5 2 3 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 
712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 
1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 
1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 
2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 
2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 
3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 
3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 // SPDX-License-Identifier: GPL-2.0-or-later /* * TUN - Universal TUN/TAP device driver. * Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com> * * $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $ */ /* * Changes: * * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 * Add TUNSETLINK ioctl to set the link encapsulation * * Mark Smith <markzzzsmith@yahoo.com.au> * Use eth_random_addr() for tap MAC address. * * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 * Fixes in packet dropping, queue length setting and queue wakeup. * Increased default tx queue length. * Added ethtool API. * Minor cleanups * * Daniel Podlejski <underley@underley.eu.org> * Modifications for 2.3.99-pre5 kernel. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define DRV_NAME "tun" #define DRV_VERSION "1.6" #define DRV_DESCRIPTION "Universal TUN/TAP device driver" #define DRV_COPYRIGHT "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>" #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/miscdevice.h> #include <linux/ethtool.h> #include <linux/rtnetlink.h> #include <linux/compat.h> #include <linux/if.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_tun.h> #include <linux/if_vlan.h> #include <linux/crc32.h> #include <linux/math.h> #include <linux/nsproxy.h> #include <linux/virtio_net.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <net/xdp.h> #include <net/ip_tunnels.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/skb_array.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/mutex.h> #include <linux/ieee802154.h> #include <uapi/linux/if_ltalk.h> #include <uapi/linux/if_fddi.h> #include <uapi/linux/if_hippi.h> #include <uapi/linux/if_fc.h> #include <net/ax25.h> #include <net/rose.h> #include <net/6lowpan.h> #include <net/rps.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> static void tun_default_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd); #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD) /* TUN device flags */ /* IFF_ATTACH_QUEUE is never stored in device flags, * overload it to mean fasync when stored there. */ #define TUN_FASYNC IFF_ATTACH_QUEUE /* High bits in flags field are unused. 
*/
/* Endianness markers kept in tun_struct.flags next to the IFF_* bits;
 * they are tested by the tun_*_is_little_endian() helpers.
 */
#define TUN_VNET_LE     0x80000000
#define TUN_VNET_BE     0x40000000

#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)

#define GOODCOPY_LEN 128

/* Number of rows in tap_filter.addr[] */
#define FLT_EXACT_COUNT 8
struct tap_filter {
	unsigned int    count;    /* Number of addrs. Zero means disabled */
	u32             mask[2];  /* Mask of the hashed addrs */
	/* One ETH_ALEN-byte address per row */
	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
};

/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to max number of VCPUs in guest. */
#define MAX_TAP_QUEUES 256
/* Upper bound on tun_struct.flow_count enforced by tun_flow_update() */
#define MAX_TAP_FLOWS  4096

/* NOTE(review): presumably the default for tun_struct.ageing_time - confirm
 * against tun_flow_init(), which is not visible here.
 */
#define TUN_FLOW_EXPIRE (3 * HZ)

/* A tun_file connects an open character device to a tuntap netdevice. It
 * also contains all socket related structures (except sock_fprog and tap_filter)
 * to serve as one transmit queue for tuntap device. The sock_fprog and
 * tap_filter were kept in tun_struct since they were used for filtering for the
 * netdevice not for a specific queue (at least I didn't see the requirement for
 * this).
 *
 * RCU usage:
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 * other can only be read while rcu_read_lock or rtnl_lock is held.
*/
struct tun_file {
	struct sock sk;
	struct socket socket;
	/* Owning device; only read under rcu_read_lock or rtnl_lock */
	struct tun_struct __rcu *tun;
	struct fasync_struct *fasync;
	/* only used for fasync */
	unsigned int flags;
	union {
		/* Slot of this file in tun_struct.tfiles[] while attached */
		u16 queue_index;
		unsigned int ifindex;
	};
	struct napi_struct napi;
	/* Both flags are written by tun_napi_init() */
	bool napi_enabled;
	bool napi_frags_enabled;
	struct mutex napi_mutex;	/* Protects access to the above napi */
	/* Link in tun_struct.disabled while the queue is disabled */
	struct list_head next;
	/* Device this file was disabled on; NULL when not disabled */
	struct tun_struct *detached;
	/* Entries are released through tun_ptr_free() */
	struct ptr_ring tx_ring;
	struct xdp_rxq_info xdp_rxq;
};

struct tun_page {
	struct page *page;
	int count;
};

/* One entry of the per-device flow table (tun_struct.flows[]) */
struct tun_flow_entry {
	struct hlist_node hash_link;	/* chain inside one flows[] bucket */
	struct rcu_head rcu;		/* used by kfree_rcu() on deletion */
	struct tun_struct *tun;

	u32 rxhash;			/* lookup key, see tun_flow_find() */
	u32 rps_rxhash;
	int queue_index;		/* queue that last saw this flow */
	/* jiffies value of the last tun_flow_update() hit */
	unsigned long updated ____cacheline_aligned_in_smp;
};

/* Bucket count of tun_struct.flows[]; tun_hashfn() masks the hash with
 * TUN_MASK_FLOW_ENTRIES to select a bucket.
 */
#define TUN_NUM_FLOW_ENTRIES 1024
#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)

/* RCU-managed wrapper around a BPF program pointer */
struct tun_prog {
	struct rcu_head rcu;
	struct bpf_prog *prog;
};

/* Since the sockets were moved to tun_file, the socket filter, sndbuf and
 * vnet header size are kept here and restored when a file is attached to
 * a persistent device, to preserve the behavior of persistent devices.
*/ struct tun_struct { struct tun_file __rcu *tfiles[MAX_TAP_QUEUES]; unsigned int numqueues; unsigned int flags; kuid_t owner; kgid_t group; struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4) int align; int vnet_hdr_sz; int sndbuf; struct tap_filter txflt; struct sock_fprog fprog; /* protected by rtnl lock */ bool filter_attached; u32 msg_enable; spinlock_t lock; struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]; struct timer_list flow_gc_timer; unsigned long ageing_time; unsigned int numdisabled; struct list_head disabled; void *security; u32 flow_count; u32 rx_batched; atomic_long_t rx_frame_errors; struct bpf_prog __rcu *xdp_prog; struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; /* init args */ struct file *file; struct ifreq *ifr; }; struct veth { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static void tun_flow_init(struct tun_struct *tun); static void tun_flow_uninit(struct tun_struct *tun); static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; struct sk_buff *skb; int received = 0; __skb_queue_head_init(&process_queue); spin_lock(&queue->lock); skb_queue_splice_tail_init(queue, &process_queue); spin_unlock(&queue->lock); while (received < budget && (skb = __skb_dequeue(&process_queue))) { napi_gro_receive(napi, skb); ++received; } if (!skb_queue_empty(&process_queue)) { spin_lock(&queue->lock); skb_queue_splice(&process_queue, queue); spin_unlock(&queue->lock); } return received; } static int tun_napi_poll(struct napi_struct *napi, int budget) { unsigned int received; received = tun_napi_receive(napi, budget); if (received < budget) napi_complete_done(napi, received); return received; } static void 
tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, bool napi_en, bool napi_frags) { tfile->napi_enabled = napi_en; tfile->napi_frags_enabled = napi_en && napi_frags; if (napi_en) { netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll); napi_enable(&tfile->napi); } } static void tun_napi_enable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_enable(&tfile->napi); } static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) napi_disable(&tfile->napi); } static void tun_napi_del(struct tun_file *tfile) { if (tfile->napi_enabled) netif_napi_del(&tfile->napi); } static bool tun_napi_frags_enabled(const struct tun_file *tfile) { return tfile->napi_frags_enabled; } #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { return tun->flags & TUN_VNET_BE ? false : virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { int be = !!(tun->flags & TUN_VNET_BE); if (put_user(be, argp)) return -EFAULT; return 0; } static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { int be; if (get_user(be, argp)) return -EFAULT; if (be) tun->flags |= TUN_VNET_BE; else tun->flags &= ~TUN_VNET_BE; return 0; } #else static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { return virtio_legacy_is_little_endian(); } static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp) { return -EINVAL; } static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp) { return -EINVAL; } #endif /* CONFIG_TUN_VNET_CROSS_LE */ static inline bool tun_is_little_endian(struct tun_struct *tun) { return tun->flags & TUN_VNET_LE || tun_legacy_is_little_endian(tun); } static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val) { return __virtio16_to_cpu(tun_is_little_endian(tun), val); } static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val) { return __cpu_to_virtio16(tun_is_little_endian(tun), val); } 
static inline u32 tun_hashfn(u32 rxhash) { return rxhash & TUN_MASK_FLOW_ENTRIES; } static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash) { struct tun_flow_entry *e; hlist_for_each_entry_rcu(e, head, hash_link) { if (e->rxhash == rxhash) return e; } return NULL; } static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun, struct hlist_head *head, u32 rxhash, u16 queue_index) { struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e) { netif_info(tun, tx_queued, tun->dev, "create flow: hash %u index %u\n", rxhash, queue_index); e->updated = jiffies; e->rxhash = rxhash; e->rps_rxhash = 0; e->queue_index = queue_index; e->tun = tun; hlist_add_head_rcu(&e->hash_link, head); ++tun->flow_count; } return e; } static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; } static void tun_flow_flush(struct tun_struct *tun) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) tun_flow_delete(tun, e); } spin_unlock_bh(&tun->lock); } static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index) { int i; spin_lock_bh(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { if (e->queue_index == queue_index) tun_flow_delete(tun, e); } } spin_unlock_bh(&tun->lock); } static void tun_flow_cleanup(struct timer_list *t) { struct tun_struct *tun = from_timer(tun, t, flow_gc_timer); unsigned long delay = tun->ageing_time; unsigned long next_timer = jiffies + delay; unsigned long count = 0; int i; spin_lock(&tun->lock); for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) { struct tun_flow_entry *e; struct 
hlist_node *n; hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) { unsigned long this_timer; this_timer = e->updated + delay; if (time_before_eq(this_timer, jiffies)) { tun_flow_delete(tun, e); continue; } count++; if (time_before(this_timer, next_timer)) next_timer = this_timer; } } if (count) mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer)); spin_unlock(&tun->lock); } static void tun_flow_update(struct tun_struct *tun, u32 rxhash, struct tun_file *tfile) { struct hlist_head *head; struct tun_flow_entry *e; unsigned long delay = tun->ageing_time; u16 queue_index = tfile->queue_index; head = &tun->flows[tun_hashfn(rxhash)]; rcu_read_lock(); e = tun_flow_find(head, rxhash); if (likely(e)) { /* TODO: keep queueing to old queue until it's empty? */ if (READ_ONCE(e->queue_index) != queue_index) WRITE_ONCE(e->queue_index, queue_index); if (e->updated != jiffies) e->updated = jiffies; sock_rps_record_flow_hash(e->rps_rxhash); } else { spin_lock_bh(&tun->lock); if (!tun_flow_find(head, rxhash) && tun->flow_count < MAX_TAP_FLOWS) tun_flow_create(tun, head, rxhash, queue_index); if (!timer_pending(&tun->flow_gc_timer)) mod_timer(&tun->flow_gc_timer, round_jiffies_up(jiffies + delay)); spin_unlock_bh(&tun->lock); } rcu_read_unlock(); } /* Save the hash received in the stack receive path and update the * flow_hash table accordingly. */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; } /* We try to identify a flow through its rxhash. The reason that * we do not check rxq no. is because some cards(e.g 82599), chooses * the rxq based on the txq where the last packet of the flow comes. As * the userspace application move between processors, we may get a * different rxq no. here. 
*/
/* Flow-table based queue selection: hash the skb, look the hash up in
 * tun->flows and use the queue recorded in the entry (also saving the hash
 * as its rps_rxhash); on a miss fall back to
 * reciprocal_scale(hash, numqueues).
 */
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
	struct tun_flow_entry *e;
	u32 txq, numqueues;

	numqueues = READ_ONCE(tun->numqueues);

	txq = __skb_get_hash_symmetric(skb);
	e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
	if (e) {
		tun_flow_save_rps_rxhash(e, txq);
		txq = e->queue_index;
	} else {
		txq = reciprocal_scale(txq, numqueues);
	}

	return txq;
}

/* Queue selection through the attached steering program. Returns 0 when
 * there are no queues or no program; otherwise the program's result
 * (truncated to u16) modulo the number of queues.
 */
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
	struct tun_prog *prog;
	u32 numqueues;
	u16 ret = 0;

	numqueues = READ_ONCE(tun->numqueues);
	/* Guards the modulo below against a zero divisor */
	if (!numqueues)
		return 0;

	prog = rcu_dereference(tun->steering_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog->prog, skb);

	return ret % numqueues;
}

/* Choose the tx queue for @skb: the steering program when one is installed,
 * the flow table otherwise. @sb_dev is not used. Runs both selectors under
 * rcu_read_lock().
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
			    struct net_device *sb_dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	u16 ret;

	rcu_read_lock();
	if (rcu_dereference(tun->steering_prog))
		ret = tun_ebpf_select_queue(tun, skb);
	else
		ret = tun_automq_select_queue(tun, skb);
	rcu_read_unlock();

	return ret;
}

/* True when the current task has CAP_NET_ADMIN in the user namespace of the
 * device's netns, or its euid equals a valid tun->owner, or it is in a valid
 * tun->group.
 */
static inline bool tun_capable(struct tun_struct *tun)
{
	const struct cred *cred = current_cred();
	struct net *net = dev_net(tun->dev);

	if (ns_capable(net->user_ns, CAP_NET_ADMIN))
		return 1;
	if (uid_valid(tun->owner) && uid_eq(cred->euid, tun->owner))
		return 1;
	if (gid_valid(tun->group) && in_egroup_p(tun->group))
		return 1;
	return 0;
}

/* Propagate tun->numqueues to the net device's real tx and rx queue counts */
static void tun_set_real_num_queues(struct tun_struct *tun)
{
	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
}

/* Park @tfile on @tun's disabled list and remember the device in
 * tfile->detached.
 */
static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
	tfile->detached = tun;
	list_add_tail(&tfile->next, &tun->disabled);
	++tun->numdisabled;
}

/* Undo tun_disable_queue(): take @tfile off the disabled list and return the
 * device it was parked on.
 */
static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
{
	struct tun_struct *tun = tfile->detached;

	tfile->detached = NULL;
	list_del_init(&tfile->next);
	--tun->numdisabled;
	return tun;
}

/* Release one tx_ring entry, which is either an XDP frame or an skb.
 * A NULL @ptr is ignored.
 */
void tun_ptr_free(void *ptr)
{
	if (!ptr)
		return;
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		xdp_return_frame(xdpf);
	} else {
		__skb_array_destroy_skb(ptr);
	}
}
EXPORT_SYMBOL_GPL(tun_ptr_free);

/* Free everything queued on @tfile: the tx ring and the socket's write and
 * error queues.
 */
static void tun_queue_purge(struct tun_file *tfile)
{
	void *ptr;

	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
		tun_ptr_free(ptr);

	skb_queue_purge(&tfile->sk.sk_write_queue);
	skb_queue_purge(&tfile->sk.sk_error_queue);
}

/* Detach @tfile from its device. Uses rtnl_dereference(), so the caller is
 * expected to hold the RTNL (tun_detach() takes it).
 *
 * @clean == false: the queue is only disabled - the file is parked on the
 * device's disabled list and keeps its tun pointer.
 * @clean == true: the file is fully released - the tun pointer is cleared,
 * socket references are dropped, the xdp rxq and tx ring are torn down, and
 * a non-persistent registered device with no queues left is unregistered.
 */
static void __tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_file *ntfile;
	struct tun_struct *tun;

	tun = rtnl_dereference(tfile->tun);

	if (tun && clean) {
		/* A disabled file already had its napi disabled */
		if (!tfile->detached)
			tun_napi_disable(tfile);
		tun_napi_del(tfile);
	}

	if (tun && !tfile->detached) {
		u16 index = tfile->queue_index;
		BUG_ON(index >= tun->numqueues);

		/* Fill the hole with the last queue so that tfiles[] stays
		 * dense, then renumber the moved file.
		 */
		rcu_assign_pointer(tun->tfiles[index],
				   tun->tfiles[tun->numqueues - 1]);
		ntfile = rtnl_dereference(tun->tfiles[index]);
		ntfile->queue_index = index;
		ntfile->xdp_rxq.queue_index = index;
		rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
				   NULL);

		--tun->numqueues;
		if (clean) {
			RCU_INIT_POINTER(tfile->tun, NULL);
			sock_put(&tfile->sk);
		} else {
			tun_disable_queue(tun, tfile);
			tun_napi_disable(tfile);
		}

		synchronize_net();
		/* NOTE(review): numqueues was already decremented above, so
		 * the slot that was just vacated is index tun->numqueues,
		 * not tun->numqueues + 1 - confirm that this argument is
		 * intended.
		 */
		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
		/* Drop read queue */
		tun_queue_purge(tfile);
		tun_set_real_num_queues(tun);
	} else if (tfile->detached && clean) {
		tun = tun_enable_queue(tfile);
		sock_put(&tfile->sk);
	}

	if (clean) {
		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
			netif_carrier_off(tun->dev);

			if (!(tun->flags & IFF_PERSIST) &&
			    tun->dev->reg_state == NETREG_REGISTERED)
				unregister_netdevice(tun->dev);
		}
		if (tun)
			xdp_rxq_info_unreg(&tfile->xdp_rxq);
		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
	}
}

/* Locked wrapper around __tun_detach(): takes the RTNL, notifies a state
 * change on the device the file was attached to (if any), and for a clean
 * detach drops one more socket reference after unlocking.
 */
static void tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_struct *tun;
	struct net_device *dev;

	rtnl_lock();
	tun = rtnl_dereference(tfile->tun);
	/* Sample dev first: __tun_detach() may clear tfile->tun */
	dev = tun ? tun->dev : NULL;
	__tun_detach(tfile, clean);
	if (dev)
		netdev_state_change(dev);
	rtnl_unlock();

	if (clean)
		sock_put(&tfile->sk);
}

/* Detach every file (attached and disabled) from @dev.
 *
 * First pass: shut the sockets down for receive, wake their readers and
 * clear the files' tun pointers. After synchronize_net(), second pass:
 * delete napi, purge queues, unregister the xdp rxq and drop the socket
 * reference of each file. Drops a module reference for persistent devices.
 */
static void tun_detach_all(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_file *tfile, *tmp;
	/* n keeps the original count; numqueues is counted down below */
	int i, n = tun->numqueues;

	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		BUG_ON(!tfile);
		tun_napi_disable(tfile);
		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
		RCU_INIT_POINTER(tfile->tun, NULL);
		--tun->numqueues;
	}
	list_for_each_entry(tfile, &tun->disabled, next) {
		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
		RCU_INIT_POINTER(tfile->tun, NULL);
	}
	BUG_ON(tun->numqueues != 0);

	synchronize_net();
	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		tun_napi_del(tfile);
		/* Drop read queue */
		tun_queue_purge(tfile);
		xdp_rxq_info_unreg(&tfile->xdp_rxq);
		sock_put(&tfile->sk);
	}
	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
		tun_napi_del(tfile);
		tun_enable_queue(tfile);
		tun_queue_purge(tfile);
		xdp_rxq_info_unreg(&tfile->xdp_rxq);
		sock_put(&tfile->sk);
	}
	BUG_ON(tun->numdisabled != 0);

	if (tun->flags & IFF_PERSIST)
		module_put(THIS_MODULE);
}

static int tun_attach(struct tun_struct *tun, struct file *file,
		      bool skip_filter, bool napi, bool napi_frags,
		      bool publish_tun)
{
	struct tun_file *tfile = file->private_data;
	struct net_device *dev = tun->dev;
	int err;

	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
	if (err < 0)
		goto out;

	err = -EINVAL;
	if (rtnl_dereference(tfile->tun) && !tfile->detached)
		goto out;

	err = -EBUSY;
	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
		goto out;

	err = -E2BIG;
	if (!tfile->detached &&
	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
		goto out;

	err = 0;

	/* Re-attach the filter to persist device */
	if (!skip_filter && (tun->filter_attached == true)) {
		lock_sock(tfile->socket.sk);
		err = sk_attach_filter(&tun->fprog,
tfile->socket.sk); release_sock(tfile->socket.sk); if (!err) goto out; } if (!tfile->detached && ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free)) { err = -ENOMEM; goto out; } tfile->queue_index = tun->numqueues; tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN; if (tfile->detached) { /* Re-attach detached tfile, updating XDP queue_index */ WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq)); if (tfile->xdp_rxq.queue_index != tfile->queue_index) tfile->xdp_rxq.queue_index = tfile->queue_index; } else { /* Setup XDP RX-queue info, for new tfile getting attached */ err = xdp_rxq_info_reg(&tfile->xdp_rxq, tun->dev, tfile->queue_index, 0); if (err < 0) goto out; err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq, MEM_TYPE_PAGE_SHARED, NULL); if (err < 0) { xdp_rxq_info_unreg(&tfile->xdp_rxq); goto out; } err = 0; } if (tfile->detached) { tun_enable_queue(tfile); tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); } if (rtnl_dereference(tun->xdp_prog)) sock_set_flag(&tfile->sk, SOCK_XDP); /* device is allowed to go away first, so no need to hold extra * refcnt. */ /* Publish tfile->tun and tun->tfiles only after we've fully * initialized tfile; otherwise we risk using half-initialized * object. 
*/ if (publish_tun) rcu_assign_pointer(tfile->tun, tun); rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile); tun->numqueues++; tun_set_real_num_queues(tun); out: return err; } static struct tun_struct *tun_get(struct tun_file *tfile) { struct tun_struct *tun; rcu_read_lock(); tun = rcu_dereference(tfile->tun); if (tun) dev_hold(tun->dev); rcu_read_unlock(); return tun; } static void tun_put(struct tun_struct *tun) { dev_put(tun->dev); } /* TAP filtering */ static void addr_hash_set(u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; mask[n >> 5] |= (1 << (n & 31)); } static unsigned int addr_hash_test(const u32 *mask, const u8 *addr) { int n = ether_crc(ETH_ALEN, addr) >> 26; return mask[n >> 5] & (1 << (n & 31)); } static int update_filter(struct tap_filter *filter, void __user *arg) { struct { u8 u[ETH_ALEN]; } *addr; struct tun_filter uf; int err, alen, n, nexact; if (copy_from_user(&uf, arg, sizeof(uf))) return -EFAULT; if (!uf.count) { /* Disabled */ filter->count = 0; return 0; } alen = ETH_ALEN * uf.count; addr = memdup_user(arg + sizeof(uf), alen); if (IS_ERR(addr)) return PTR_ERR(addr); /* The filter is updated without holding any locks. Which is * perfectly safe. We disable it first and in the worst * case we'll accept a few undesired packets. */ filter->count = 0; wmb(); /* Use first set of addresses as an exact filter */ for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++) memcpy(filter->addr[n], addr[n].u, ETH_ALEN); nexact = n; /* Remaining multicast addresses are hashed, * unicast will leave the filter disabled. */ memset(filter->mask, 0, sizeof(filter->mask)); for (; n < uf.count; n++) { if (!is_multicast_ether_addr(addr[n].u)) { err = 0; /* no filter */ goto free_addr; } addr_hash_set(filter->mask, addr[n].u); } /* For ALLMULTI just set the mask to all ones. * This overrides the mask populated above. 
*/ if ((uf.flags & TUN_FLT_ALLMULTI)) memset(filter->mask, ~0, sizeof(filter->mask)); /* Now enable the filter */ wmb(); filter->count = nexact; /* Return the number of exact filters */ err = nexact; free_addr: kfree(addr); return err; } /* Returns: 0 - drop, !=0 - accept */ static int run_filter(struct tap_filter *filter, const struct sk_buff *skb) { /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect * at this point. */ struct ethhdr *eh = (struct ethhdr *) skb->data; int i; /* Exact match */ for (i = 0; i < filter->count; i++) if (ether_addr_equal(eh->h_dest, filter->addr[i])) return 1; /* Inexact match (multicast only) */ if (is_multicast_ether_addr(eh->h_dest)) return addr_hash_test(filter->mask, eh->h_dest); return 0; } /* * Checks whether the packet is accepted or not. * Returns: 0 - drop, !=0 - accept */ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb) { if (!filter->count) return 1; return run_filter(filter, skb); } /* Network device part of the driver */ static const struct ethtool_ops tun_ethtool_ops; static int tun_net_init(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); struct ifreq *ifr = tun->ifr; int err; spin_lock_init(&tun->lock); err = security_tun_dev_alloc_security(&tun->security); if (err < 0) return err; tun_flow_init(tun); dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); INIT_LIST_HEAD(&tun->disabled); err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, ifr->ifr_flags & IFF_NAPI_FRAGS, false); if (err < 0) { tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); return err; } return 0; } /* Net device detach from 
fd. */
static void tun_net_uninit(struct net_device *dev)
{
	tun_detach_all(dev);
}

/* Net device open. */
static int tun_net_open(struct net_device *dev)
{
	netif_tx_start_all_queues(dev);

	return 0;
}

/* Net device close. */
static int tun_net_close(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);
	return 0;
}

/* Net device start xmit */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
		/* Select queue was not called for the skbuff, so we extract the
		 * RPS hash and save it into the flow_table here.
		 */
		struct tun_flow_entry *e;
		__u32 rxhash;

		rxhash = __skb_get_hash_symmetric(skb);
		e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
		if (e)
			tun_flow_save_rps_rxhash(e, rxhash);
	}
#endif
}

/*
 * Run the eBPF filter program, if one is installed, on @skb.
 * Returns the program's result, or @len unchanged when no program is set.
 * Uses rcu_dereference(), so the caller must be in an RCU read-side section.
 */
static unsigned int run_ebpf_filter(struct tun_struct *tun,
				    struct sk_buff *skb,
				    int len)
{
	struct tun_prog *prog = rcu_dereference(tun->filter_prog);

	if (prog)
		len = bpf_prog_run_clear_cb(prog->prog, skb);

	return len;
}

/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	enum skb_drop_reason drop_reason;
	int txq = skb->queue_mapping;
	struct netdev_queue *queue;
	struct tun_file *tfile;
	int len = skb->len;

	rcu_read_lock();
	tfile = rcu_dereference(tun->tfiles[txq]);

	/* Drop packet if interface is not attached */
	if (!tfile) {
		drop_reason = SKB_DROP_REASON_DEV_READY;
		goto drop;
	}

	if (!rcu_dereference(tun->steering_prog))
		tun_automq_xmit(tun, skb);

	netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);

	/* Drop if the filter does not like it.
	 * This is a noop if the filter is disabled.
	 * Filter can be enabled only for the TAP devices. */
	if (!check_filter(&tun->txflt, skb)) {
		drop_reason = SKB_DROP_REASON_TAP_TXFILTER;
		goto drop;
	}

	if (tfile->socket.sk->sk_filter &&
	    sk_filter(tfile->socket.sk, skb)) {
		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
		goto drop;
	}

	/* The eBPF filter result is used as the length to keep; 0 drops. */
	len = run_ebpf_filter(tun, skb, len);
	if (len == 0) {
		drop_reason = SKB_DROP_REASON_TAP_FILTER;
		goto drop;
	}

	if (pskb_trim(skb, len)) {
		drop_reason = SKB_DROP_REASON_NOMEM;
		goto drop;
	}

	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
		drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
		goto drop;
	}

	skb_tx_timestamp(skb);

	/* Orphan the skb - required as we might hang on to it
	 * for indefinite time.
	 */
	skb_orphan(skb);

	nf_reset_ct(skb);

	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
		drop_reason = SKB_DROP_REASON_FULL_RING;
		goto drop;
	}

	/* dev->lltx requires to do our own update of trans_start */
	queue = netdev_get_tx_queue(dev, txq);
	txq_trans_cond_update(queue);

	/* Notify and wake up reader process */
	if (tfile->flags & TUN_FASYNC)
		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
	tfile->socket.sk->sk_data_ready(tfile->socket.sk);

	rcu_read_unlock();
	return NETDEV_TX_OK;

drop:
	/* Common drop path: count it, flag the tx error and free the skb. */
	dev_core_stats_tx_dropped_inc(dev);
	skb_tx_error(skb);
	kfree_skb_reason(skb, drop_reason);
	rcu_read_unlock();
	return NET_XMIT_DROP;
}

/* Intentionally empty ndo_set_rx_mode handler; see the comment inside. */
static void tun_net_mclist(struct net_device *dev)
{
	/*
	 * This callback is supposed to deal with mc filter in
	 * _rx_ path and has nothing to do with the _tx_ path.
	 * In rx path we always accept everything userspace gives us.
	 */
}

/*
 * ndo_fix_features: within TUN_USER_FEATURES only keep the bits that are
 * also in tun->set_features; bits outside TUN_USER_FEATURES pass through.
 */
static netdev_features_t tun_net_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	struct tun_struct *tun = netdev_priv(dev);

	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
}

/* Store the requested rx headroom in tun->align, never below NET_SKB_PAD. */
static void tun_set_headroom(struct net_device *dev, int new_hr)
{
	struct tun_struct *tun = netdev_priv(dev);

	if (new_hr < NET_SKB_PAD)
		new_hr = NET_SKB_PAD;

	tun->align = new_hr;
}

/*
 * ndo_get_stats64: report the generic tstats and add the driver's own
 * rx_frame_errors counter on top.
 */
static void
tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
	struct tun_struct *tun = netdev_priv(dev);

	dev_get_tstats64(dev, stats);

	stats->rx_frame_errors +=
		(unsigned long)atomic_long_read(&tun->rx_frame_errors);
}

/*
 * Install (or remove, when @prog is NULL) the XDP program on the device and
 * mirror that in the SOCK_XDP flag of every active and disabled queue's
 * socket. The previous program's reference is dropped. Always returns 0.
 */
static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
		       struct netlink_ext_ack *extack)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_file *tfile;
	struct bpf_prog *old_prog;
	int i;

	old_prog = rtnl_dereference(tun->xdp_prog);
	rcu_assign_pointer(tun->xdp_prog, prog);
	if (old_prog)
		bpf_prog_put(old_prog);

	for (i = 0; i < tun->numqueues; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		if (prog)
			sock_set_flag(&tfile->sk, SOCK_XDP);
		else
			sock_reset_flag(&tfile->sk, SOCK_XDP);
	}
	list_for_each_entry(tfile, &tun->disabled, next) {
		if (prog)
			sock_set_flag(&tfile->sk, SOCK_XDP);
		else
			sock_reset_flag(&tfile->sk, SOCK_XDP);
	}

	return 0;
}

/* ndo_bpf: only XDP_SETUP_PROG is handled; other commands get -EINVAL. */
static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return tun_xdp_set(dev, xdp->prog, xdp->extack);
	default:
		return -EINVAL;
	}
}

/*
 * ndo_change_carrier: turning the carrier on is refused with -EPERM while
 * the device has no queues; turning it off always succeeds.
 */
static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
{
	if (new_carrier) {
		struct tun_struct *tun = netdev_priv(dev);

		if (!tun->numqueues)
			return -EPERM;

		netif_carrier_on(dev);
	} else {
		netif_carrier_off(dev);
	}
	return 0;
}

/* netdev ops used for IFF_TUN devices (see tun_net_initialize()). */
static const struct net_device_ops tun_netdev_ops = {
	.ndo_init		= tun_net_init,
	.ndo_uninit		= tun_net_uninit,
	.ndo_open		= tun_net_open,
	.ndo_stop		= tun_net_close,
	.ndo_start_xmit		= tun_net_xmit,
	.ndo_fix_features	= tun_net_fix_features,
	.ndo_select_queue	= tun_select_queue,
	.ndo_set_rx_headroom	= tun_set_headroom,
	.ndo_get_stats64	= tun_net_get_stats64,
	.ndo_change_carrier	= tun_net_change_carrier,
};

static void __tun_xdp_flush_tfile(struct tun_file *tfile)
{
	/* Notify and wake up reader process */
	if (tfile->flags & TUN_FASYNC)
		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
}

/*
 * ndo_xdp_xmit: queue up to @n XDP frames on the tx ring of the queue
 * chosen from the current CPU id.
 *
 * Returns the number of frames queued, -EINVAL for unknown @flags, or
 * -ENXIO when the device has no queues. Queuing stops at the first frame
 * that does not fit in the ring.
 */
static int tun_xdp_xmit(struct net_device *dev, int n,
			struct xdp_frame **frames, u32 flags)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_file *tfile;
	u32 numqueues;
	int nxmit = 0;
	int i;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcu_read_lock();

resample:
	numqueues = READ_ONCE(tun->numqueues);
	if (!numqueues) {
		rcu_read_unlock();
		return -ENXIO; /* Caller will free/return all frames */
	}

	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
					    numqueues]);
	/* Slot was empty: re-read the queue count and pick again. */
	if (unlikely(!tfile))
		goto resample;

	spin_lock(&tfile->tx_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *xdp = frames[i];
		/* Encode the XDP flag into lowest bit for consumer to differ
		 * XDP buffer from sk_buff.
		 */
		void *frame = tun_xdp_to_ptr(xdp);

		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
			dev_core_stats_tx_dropped_inc(dev);
			break;
		}
		nxmit++;
	}
	spin_unlock(&tfile->tx_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__tun_xdp_flush_tfile(tfile);

	rcu_read_unlock();
	return nxmit;
}

/*
 * Handle XDP_TX for a single buffer: convert it to a frame and push it
 * through tun_xdp_xmit() with a flush. Returns -EOVERFLOW if the conversion
 * fails, otherwise tun_xdp_xmit()'s result; the frame is returned here when
 * that result is 0.
 */
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
	int nxmit;

	if (unlikely(!frame))
		return -EOVERFLOW;

	nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
	if (!nxmit)
		xdp_return_frame_rx_napi(frame);
	return nxmit;
}

/* netdev ops used for IFF_TAP devices (see tun_net_initialize()). */
static const struct net_device_ops tap_netdev_ops = {
	.ndo_init		= tun_net_init,
	.ndo_uninit		= tun_net_uninit,
	.ndo_open		= tun_net_open,
	.ndo_stop		= tun_net_close,
	.ndo_start_xmit		= tun_net_xmit,
	.ndo_fix_features	= tun_net_fix_features,
	.ndo_set_rx_mode	= tun_net_mclist,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_select_queue	= tun_select_queue,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= tun_set_headroom,
	.ndo_bpf		= tun_xdp,
	.ndo_xdp_xmit		= tun_xdp_xmit,
	.ndo_change_carrier	= tun_net_change_carrier,
};

/*
 * Initialise the flow hash table heads, set the default ageing time and arm
 * the flow garbage-collection timer.
 */
static void tun_flow_init(struct tun_struct *tun)
{
	int i;

	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
		INIT_HLIST_HEAD(&tun->flows[i]);

	tun->ageing_time = TUN_FLOW_EXPIRE;
	timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
	mod_timer(&tun->flow_gc_timer,
		  round_jiffies_up(jiffies + tun->ageing_time));
}

/* Stop the flow garbage-collection timer and flush all flow entries. */
static void tun_flow_uninit(struct tun_struct *tun)
{
	del_timer_sync(&tun->flow_gc_timer);
	tun_flow_flush(tun);
}

#define MIN_MTU 68
#define MAX_MTU 65535

/* Initialize net device.
*/ static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: dev->netdev_ops = &tun_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; /* Point-to-Point TUN Device */ dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = 1500; /* Zero header length */ dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; break; case IFF_TAP: dev->netdev_ops = &tap_netdev_ops; /* Ethernet TAP Device */ ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; eth_hw_addr_random(dev); /* Currently tun does not support XDP, only tap does. */ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT; break; } dev->min_mtu = MIN_MTU; dev->max_mtu = MAX_MTU - dev->hard_header_len; } static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile) { struct sock *sk = tfile->socket.sk; return (tun->dev->flags & IFF_UP) && sock_writeable(sk); } /* Character device part */ /* Poll */ static __poll_t tun_chr_poll(struct file *file, poll_table *wait) { struct tun_file *tfile = file->private_data; struct tun_struct *tun = tun_get(tfile); struct sock *sk; __poll_t mask = 0; if (!tun) return EPOLLERR; sk = tfile->socket.sk; poll_wait(file, sk_sleep(sk), wait); if (!ptr_ring_empty(&tfile->tx_ring)) mask |= EPOLLIN | EPOLLRDNORM; /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to * guarantee EPOLLOUT to be raised by either here or * tun_sock_write_space(). Then process could get notification * after it writes to a down device and meets -EIO. 
*/ if (tun_sock_writeable(tun, tfile) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && tun_sock_writeable(tun, tfile))) mask |= EPOLLOUT | EPOLLWRNORM; if (tun->dev->reg_state != NETREG_REGISTERED) mask = EPOLLERR; tun_put(tun); return mask; } static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, size_t len, const struct iov_iter *it) { struct sk_buff *skb; size_t linear; int err; int i; if (it->nr_segs > MAX_SKB_FRAGS + 1 || len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN)) return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); local_bh_enable(); if (!skb) return ERR_PTR(-ENOMEM); linear = iov_iter_single_seg_count(it); err = __skb_grow(skb, linear); if (err) goto free; skb->len = len; skb->data_len = len - linear; skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { const struct iovec *iov = iter_iov(it) + i; size_t fragsz = iov->iov_len; struct page *page; void *frag; if (fragsz == 0 || fragsz > PAGE_SIZE) { err = -EINVAL; goto free; } frag = netdev_alloc_frag(fragsz); if (!frag) { err = -ENOMEM; goto free; } page = virt_to_head_page(frag); skb_fill_page_desc(skb, i - 1, page, frag - page_address(page), fragsz); } return skb; free: /* frees skb and all frags allocated with napi_alloc_frag() */ napi_free_frags(&tfile->napi); return ERR_PTR(err); } /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, size_t prepad, size_t len, size_t linear, int noblock) { struct sock *sk = tfile->socket.sk; struct sk_buff *skb; int err; /* Under a page? Don't bother with paged skb. 
*/ if (prepad + len < PAGE_SIZE) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, &err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return ERR_PTR(err); skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, int more) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; struct sk_buff_head process_queue; u32 rx_batched = tun->rx_batched; bool rcv = false; if (!rx_batched || (!more && skb_queue_empty(queue))) { local_bh_disable(); skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); return; } spin_lock(&queue->lock); if (!more || skb_queue_len(queue) == rx_batched) { __skb_queue_head_init(&process_queue); skb_queue_splice_tail_init(queue, &process_queue); rcv = true; } else { __skb_queue_tail(queue, skb); } spin_unlock(&queue->lock); if (rcv) { struct sk_buff *nskb; local_bh_disable(); while ((nskb = __skb_dequeue(&process_queue))) { skb_record_rx_queue(nskb, tfile->queue_index); netif_receive_skb(nskb); } skb_record_rx_queue(skb, tfile->queue_index); netif_receive_skb(skb); local_bh_enable(); } } static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile, int len, int noblock, bool zerocopy) { if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP) return false; if (tfile->socket.sk->sk_sndbuf != INT_MAX) return false; if (!noblock) return false; if (zerocopy) return false; if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE) return false; return true; } static struct sk_buff *__tun_build_skb(struct tun_file *tfile, struct page_frag *alloc_frag, char *buf, int buflen, int len, int pad) { struct sk_buff *skb = build_skb(buf, 
buflen); if (!skb) return ERR_PTR(-ENOMEM); skb_reserve(skb, pad); skb_put(skb, len); skb_set_owner_w(skb, tfile->socket.sk); get_page(alloc_frag->page); alloc_frag->offset += buflen; return skb; } static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, struct xdp_buff *xdp, u32 act) { int err; switch (act) { case XDP_REDIRECT: err = xdp_do_redirect(tun->dev, xdp, xdp_prog); if (err) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_TX: err = tun_xdp_tx(tun->dev, xdp); if (err < 0) { dev_core_stats_rx_dropped_inc(tun->dev); return err; } dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); break; case XDP_PASS: break; default: bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: dev_core_stats_rx_dropped_inc(tun->dev); break; } return act; } static struct sk_buff *tun_build_skb(struct tun_struct *tun, struct tun_file *tfile, struct iov_iter *from, struct virtio_net_hdr *hdr, int len, int *skb_xdp) { struct page_frag *alloc_frag = &current->task_frag; struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog *xdp_prog; int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); char *buf; size_t copied; int pad = TUN_RX_PAD; int err = 0; rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) pad += XDP_PACKET_HEADROOM; buflen += SKB_DATA_ALIGN(len + pad); rcu_read_unlock(); alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES); if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL))) return ERR_PTR(-ENOMEM); buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset; copied = copy_page_from_iter(alloc_frag->page, alloc_frag->offset + pad, len, from); if (copied != len) return ERR_PTR(-EFAULT); /* There's a small window that XDP may be set after the check * of xdp_prog above, this should be 
rare and for simplicity * we do XDP on skb in case the headroom is not enough. */ if (hdr->gso_type || !xdp_prog) { *skb_xdp = 1; return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); } *skb_xdp = 0; local_bh_disable(); rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { struct xdp_buff xdp; u32 act; xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq); xdp_prepare_buff(&xdp, buf, pad, len, false); act = bpf_prog_run_xdp(xdp_prog, &xdp); if (act == XDP_REDIRECT || act == XDP_TX) { get_page(alloc_frag->page); alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); if (err < 0) { if (act == XDP_REDIRECT || act == XDP_TX) put_page(alloc_frag->page); goto out; } if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) goto out; pad = xdp.data - xdp.data_hard_start; len = xdp.data_end - xdp.data; } bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); out: bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); local_bh_enable(); return NULL; } /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, struct iov_iter *from, int noblock, bool more) { struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; struct virtio_net_hdr gso = { 0 }; int good_linear; int copylen; bool zerocopy = false; int err; u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) return -EINVAL; len -= sizeof(pi); if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); if (len < vnet_hdr_sz) return -EINVAL; len 
-= vnet_hdr_sz; if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len)) gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2); if (tun16_to_cpu(tun, gso.hdr_len) > len) return -EINVAL; iov_iter_advance(from, vnet_hdr_sz - sizeof(gso)); } if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) { align += NET_IP_ALIGN; if (unlikely(len < ETH_HLEN || (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN))) return -EINVAL; } good_linear = SKB_MAX_HEAD(align); if (msg_control) { struct iov_iter i = *from; /* There are 256 bytes to be copied in skb, so there is * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; linear = copylen; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (!skb) return total_len; } else { if (!zerocopy) { copylen = len; if (tun16_to_cpu(tun, gso.hdr_len) > good_linear) linear = good_linear; else linear = tun16_to_cpu(tun, gso.hdr_len); } if (frags) { mutex_lock(&tfile->napi_mutex); skb = tun_napi_alloc_frags(tfile, copylen, from); /* tun_napi_alloc_frags() enforces a layout for the skb. * If zerocopy is enabled, then this layout will be * overwritten by zerocopy_sg_from_iter(). 
*/ zerocopy = false; } else { if (!linear) linear = min_t(size_t, good_linear, copylen); skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); } err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; } } if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; } switch (tun->flags & TUN_TYPE_MASK) { case IFF_TUN: if (tun->flags & IFF_NO_PI) { u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0; switch (ip_version) { case 4: pi.proto = htons(ETH_P_IP); break; case 6: pi.proto = htons(ETH_P_IPV6); break; default: err = -EINVAL; goto drop; } } skb_reset_mac_header(skb); skb->protocol = pi.proto; skb->dev = tun->dev; break; case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); break; } /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); skb_probe_transport_header(skb); skb_record_rx_queue(skb, tfile->queue_index); if (skb_xdp) { struct bpf_prog *xdp_prog; int ret; local_bh_disable(); rcu_read_lock(); xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { ret = do_xdp_generic(xdp_prog, &skb); if (ret != XDP_PASS) { rcu_read_unlock(); local_bh_enable(); goto unlock_frags; } } rcu_read_unlock(); local_bh_enable(); } /* Compute the costly rx hash only if needed for flow updates. * We may get a very small possibility of OOO during switching, not * worth to optimize. 
*/ if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 && !tfile->detached) rxhash = __skb_get_hash_symmetric(skb); rcu_read_lock(); if (unlikely(!(tun->dev->flags & IFF_UP))) { err = -EIO; rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } if (frags) { u32 headlen; /* Exercise flow dissector code path. */ skb_push(skb, ETH_HLEN); headlen = eth_get_headlen(tun->dev, skb->data, skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { WARN_ON_ONCE(1); err = -ENOMEM; dev_core_stats_rx_dropped_inc(tun->dev); napi_busy: napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); return err; } if (likely(napi_schedule_prep(&tfile->napi))) { local_bh_disable(); napi_gro_frags(&tfile->napi); napi_complete(&tfile->napi); local_bh_enable(); } else { err = -EBUSY; goto napi_busy; } mutex_unlock(&tfile->napi_mutex); } else if (tfile->napi_enabled) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; spin_lock_bh(&queue->lock); if (unlikely(tfile->detached)) { spin_unlock_bh(&queue->lock); rcu_read_unlock(); err = -EBUSY; goto free_skb; } __skb_queue_tail(queue, skb); queue_len = skb_queue_len(queue); spin_unlock(&queue->lock); if (!more || queue_len > NAPI_POLL_WEIGHT) napi_schedule(&tfile->napi); local_bh_enable(); } else if (!IS_ENABLED(CONFIG_4KSTACKS)) { tun_rx_batched(tun, tfile, skb, more); } else { netif_rx(skb); } rcu_read_unlock(); preempt_disable(); dev_sw_netstats_rx_add(tun->dev, len); preempt_enable(); if (rxhash) tun_flow_update(tun, rxhash, tfile); return total_len; drop: if (err != -EAGAIN) dev_core_stats_rx_dropped_inc(tun->dev); free_skb: if (!IS_ERR_OR_NULL(skb)) kfree_skb_reason(skb, drop_reason); unlock_frags: if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); } return err ?: total_len; } static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tun_file *tfile = file->private_data; struct 
tun_struct *tun = tun_get(tfile);
	ssize_t result;
	int noblock = 0;

	if (!tun)
		return -EBADFD;

	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
		noblock = 1;

	result = tun_get_user(tun, tfile, NULL, from, noblock, false);

	/* Drop the reference taken by tun_get(). */
	tun_put(tun);
	return result;
}

/*
 * Copy one XDP frame to the user buffer described by @iter.
 *
 * With IFF_VNET_HDR set, an all-zero struct virtio_net_hdr is written first
 * and the iterator is then advanced over the remainder of the configured
 * vnet_hdr_sz.  Fails with -EINVAL if @iter is shorter than vnet_hdr_sz and
 * with -EFAULT if the header copy comes up short.
 *
 * Returns the number of frame bytes copied plus vnet_hdr_sz; the same value
 * is added to the device tx byte statistics.
 */
static ssize_t tun_put_user_xdp(struct tun_struct *tun,
				struct tun_file *tfile,
				struct xdp_frame *xdp_frame,
				struct iov_iter *iter)
{
	int vnet_hdr_sz = 0;
	size_t size = xdp_frame->len;
	size_t ret;

	if (tun->flags & IFF_VNET_HDR) {
		struct virtio_net_hdr gso = { 0 };

		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
			return -EINVAL;
		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
			     sizeof(gso)))
			return -EFAULT;
		/* Skip the part of the header area beyond struct virtio_net_hdr. */
		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
	}

	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;

	preempt_disable();
	dev_sw_netstats_tx_add(tun->dev, 1, ret);
	preempt_enable();

	return ret;
}

/*
 * Put packet to the user space buffer.
 *
 * Layout written to @iter, in order: struct tun_pi (unless IFF_NO_PI),
 * the virtio-net header area of vnet_hdr_sz bytes (if IFF_VNET_HDR), then
 * the packet data with a VLAN header re-inserted when the skb carries a
 * VLAN tag.
 *
 * Returns the full size of that layout (not the number of bytes actually
 * copied), or -EINVAL / -EFAULT on header errors.  When @iter is too small
 * for the whole packet, TUN_PKT_STRIP is set in pi.flags and the data copy
 * stops early.
 */
static ssize_t tun_put_user(struct tun_struct *tun,
			    struct tun_file *tfile,
			    struct sk_buff *skb,
			    struct iov_iter *iter)
{
	struct tun_pi pi = { 0, skb->protocol };
	ssize_t total;
	int vlan_offset = 0;
	int vlan_hlen = 0;
	int vnet_hdr_sz = 0;

	if (skb_vlan_tag_present(skb))
		vlan_hlen = VLAN_HLEN;

	if (tun->flags & IFF_VNET_HDR)
		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);

	total = skb->len + vlan_hlen + vnet_hdr_sz;

	if (!(tun->flags & IFF_NO_PI)) {
		if (iov_iter_count(iter) < sizeof(pi))
			return -EINVAL;

		total += sizeof(pi);
		if (iov_iter_count(iter) < total) {
			/* Packet will be stripped */
			pi.flags |= TUN_PKT_STRIP;
		}

		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
			return -EFAULT;
	}

	if (vnet_hdr_sz) {
		struct virtio_net_hdr gso;

		if (iov_iter_count(iter) < vnet_hdr_sz)
			return -EINVAL;

		if (virtio_net_hdr_from_skb(skb, &gso,
					    tun_is_little_endian(tun), true,
					    vlan_hlen)) {
			struct skb_shared_info *sinfo = skb_shinfo(skb);

			/* Rate-limited diagnostics before failing the read. */
			if (net_ratelimit()) {
				netdev_err(tun->dev,
					   "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
					   sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
					   tun16_to_cpu(tun, gso.hdr_len));
				print_hex_dump(KERN_ERR, "tun: ",
					       DUMP_PREFIX_NONE,
					       16, 1, skb->head,
					       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
			}
			WARN_ON_ONCE(1);
			return -EINVAL;
		}

		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
			return -EFAULT;

		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
	}

	if (vlan_hlen) {
		int ret;
		struct veth veth;

		veth.h_vlan_proto = skb->vlan_proto;
		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));

		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);

		/*
		 * Copy the bytes preceding the VLAN header, then the VLAN
		 * header itself; stop as soon as a copy fails or the user
		 * buffer is exhausted.
		 */
		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
		if (ret || !iov_iter_count(iter))
			goto done;

		ret = copy_to_iter(&veth, sizeof(veth), iter);
		if (ret != sizeof(veth) || !iov_iter_count(iter))
			goto done;
	}

	/* The return value of this final copy is not checked. */
	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);

done:
	/* caller is in process context, */
	preempt_disable();
	dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
	preempt_enable();

	return total;
}

/*
 * Take one entry off @tfile's tx_ring.
 *
 * If the ring is empty and @noblock is set, fails with -EAGAIN.  Otherwise
 * sleeps interruptibly on the socket wait queue until an entry appears, a
 * signal is pending (-ERESTARTSYS) or RCV_SHUTDOWN is set on the socket
 * (-EFAULT).
 *
 * Returns the consumed pointer, or NULL on failure.  *@err is always
 * written: 0 on success, the error code otherwise.
 */
static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
{
	DECLARE_WAITQUEUE(wait, current);
	void *ptr = NULL;
	int error = 0;

	/* Fast path: something is already queued. */
	ptr = ptr_ring_consume(&tfile->tx_ring);
	if (ptr)
		goto out;
	if (noblock) {
		error = -EAGAIN;
		goto out;
	}

	add_wait_queue(&tfile->socket.wq.wait, &wait);

	while (1) {
		/* Set the task state before re-checking the ring. */
		set_current_state(TASK_INTERRUPTIBLE);
		ptr = ptr_ring_consume(&tfile->tx_ring);
		if (ptr)
			break;
		if (signal_pending(current)) {
			error = -ERESTARTSYS;
			break;
		}
		if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
			error = -EFAULT;
			break;
		}

		schedule();
	}

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&tfile->socket.wq.wait, &wait);

out:
	*err = error;
	return ptr;
}

/*
 * Deliver one queued entry to the user buffer @to.
 *
 * @ptr may carry an entry the caller already holds; when it is NULL one is
 * taken from the ring via tun_ring_recv().  If @to has no room at all, @ptr
 * is released with tun_ptr_free() and 0 is returned.
 */
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
			   struct iov_iter *to,
			   int noblock, void *ptr)
{
	ssize_t ret;
	int err;

	if (!iov_iter_count(to)) {
		tun_ptr_free(ptr);
		return 0;
	}

	if (!ptr) {
		/* Read frames from ring */
		ptr = tun_ring_recv(tfile, noblock, &err);
		if (!ptr)
			return err;
	}

	if
(tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
		/* The frame is returned whether or not the copy succeeded. */
		xdp_return_frame(xdpf);
	} else {
		struct sk_buff *skb = ptr;

		ret = tun_put_user(tun, tfile, skb, to);
		/* Free the skb either way: kfree_skb() on error, consume_skb() on success. */
		if (unlikely(ret < 0))
			kfree_skb(skb);
		else
			consume_skb(skb);
	}

	return ret;
}

/*
 * read_iter handler of the tun character device (see tun_fops).
 *
 * Reads one entry through tun_do_read().  The result is capped at the size
 * the iterator had on entry, and a positive result is also stored in
 * iocb->ki_pos.  Returns -EBADFD when tun_get() yields no device.
 */
static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun = tun_get(tfile);
	ssize_t len = iov_iter_count(to), ret;
	int noblock = 0;

	if (!tun)
		return -EBADFD;

	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
		noblock = 1;

	ret = tun_do_read(tun, tfile, to, noblock, NULL);
	/* Never report more than the caller's buffer could hold. */
	ret = min_t(ssize_t, ret, len);
	if (ret > 0)
		iocb->ki_pos = ret;
	tun_put(tun);
	return ret;
}

/*
 * RCU callback queued by __tun_set_ebpf(): destroys the BPF program held by
 * the wrapper and frees the wrapper itself.
 */
static void tun_prog_free(struct rcu_head *rcu)
{
	struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);

	bpf_prog_destroy(prog->prog);
	kfree(prog);
}

/*
 * Replace the program stored in *@prog_p.
 *
 * A non-NULL @prog is wrapped in a freshly allocated struct tun_prog; a NULL
 * @prog clears the slot.  The swap is done under tun->lock and the previous
 * wrapper, if any, is released through call_rcu().
 *
 * Returns 0 on success or -ENOMEM if the wrapper cannot be allocated (in
 * which case *@prog_p is left untouched).
 */
static int __tun_set_ebpf(struct tun_struct *tun,
			  struct tun_prog __rcu **prog_p,
			  struct bpf_prog *prog)
{
	struct tun_prog *old, *new = NULL;

	if (prog) {
		new = kmalloc(sizeof(*new), GFP_KERNEL);
		if (!new)
			return -ENOMEM;
		new->prog = prog;
	}

	spin_lock_bh(&tun->lock);
	old = rcu_dereference_protected(*prog_p,
					lockdep_is_held(&tun->lock));
	rcu_assign_pointer(*prog_p, new);
	spin_unlock_bh(&tun->lock);

	if (old)
		call_rcu(&old->rcu, tun_prog_free);

	return 0;
}

/*
 * Private destructor installed by tun_setup() as dev->priv_destructor.
 *
 * Asserts that the list of disabled queues is empty, tears down the flow
 * state and the security blob, and clears both the steering and the filter
 * program slots.
 */
static void tun_free_netdev(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	BUG_ON(!(list_empty(&tun->disabled)));

	tun_flow_uninit(tun);
	security_tun_dev_free_security(tun->security);
	__tun_set_ebpf(tun, &tun->steering_prog, NULL);
	__tun_set_ebpf(tun, &tun->filter_prog, NULL);
}

/*
 * Net device setup callback (used by tun_link_ops and passed to
 * alloc_netdev_mqs() in tun_set_iff()): resets owner and group to the
 * invalid IDs, installs default link settings, the ethtool ops and the
 * destructor.
 */
static void tun_setup(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	tun->owner = INVALID_UID;
	tun->group = INVALID_GID;
	tun_default_link_ksettings(dev, &tun->link_ksettings);

	dev->ethtool_ops = &tun_ethtool_ops;
	dev->needs_free_netdev = true;
dev->priv_destructor = tun_free_netdev;
	/* We prefer our own queue length */
	dev->tx_queue_len = TUN_READQ_SIZE;
}

/* Trivial set of netlink ops to allow deleting tun or tap
 * device with netlink.
 */

/*
 * rtnl_link_ops.validate hook: always refuses with -EOPNOTSUPP and sets an
 * extack message saying that creation through rtnetlink is unsupported.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack,
		       "tun/tap creation via rtnetlink is not supported.");
	return -EOPNOTSUPP;
}

/*
 * rtnl_link_ops.get_size hook: upper bound on the attribute space needed by
 * tun_fill_info().  The build-time checks pin uid_t and gid_t to 32 bits
 * because both are emitted with nla_put_u32().
 */
static size_t tun_get_size(const struct net_device *dev)
{
	BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
	BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));

	return nla_total_size(sizeof(uid_t)) + /* OWNER */
	       nla_total_size(sizeof(gid_t)) + /* GROUP */
	       nla_total_size(sizeof(u8)) + /* TYPE */
	       nla_total_size(sizeof(u8)) + /* PI */
	       nla_total_size(sizeof(u8)) + /* VNET_HDR */
	       nla_total_size(sizeof(u8)) + /* PERSIST */
	       nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */
	       nla_total_size(sizeof(u32)) + /* NUM_QUEUES */
	       nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */
	       0;
}

/*
 * rtnl_link_ops.fill_info hook: dump the device's type, owner/group (only
 * when valid), the PI/VNET_HDR/PERSIST/MULTI_QUEUE booleans and, for
 * multi-queue devices, the active and disabled queue counts.
 *
 * Returns 0 on success or -EMSGSIZE if any attribute does not fit.
 */
static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
		goto nla_put_failure;
	if (uid_valid(tun->owner) &&
	    nla_put_u32(skb, IFLA_TUN_OWNER,
			from_kuid_munged(current_user_ns(), tun->owner)))
		goto nla_put_failure;
	if (gid_valid(tun->group) &&
	    nla_put_u32(skb, IFLA_TUN_GROUP,
			from_kgid_munged(current_user_ns(), tun->group)))
		goto nla_put_failure;
	/* IFLA_TUN_PI is the inverse of the IFF_NO_PI flag. */
	if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
		goto nla_put_failure;
	if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
		goto nla_put_failure;
	if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
		goto nla_put_failure;
	if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
		       !!(tun->flags & IFF_MULTI_QUEUE)))
		goto nla_put_failure;
	if (tun->flags & IFF_MULTI_QUEUE) {
		if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
			goto nla_put_failure;
		if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
				tun->numdisabled))
			goto
nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* rtnetlink link operations for tun/tap devices. */
static struct rtnl_link_ops tun_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct tun_struct),
	.setup		= tun_setup,
	.validate	= tun_validate,
	.get_size       = tun_get_size,
	.fill_info      = tun_fill_info,
};

/*
 * sk_write_space callback installed by tun_chr_open().
 *
 * Does nothing unless the socket is writeable and SOCKWQ_ASYNC_NOSPACE was
 * set (the bit is cleared by the test).  Then wakes pollers waiting for
 * write readiness and sends SIGIO/POLL_OUT to fasync listeners.
 */
static void tun_sock_write_space(struct sock *sk)
{
	struct tun_file *tfile;
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk))
		return;

	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

	tfile = container_of(sk, struct tun_file, sk);
	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}

/*
 * Release the page references accumulated in @tpage (tpage->count of them)
 * if a page is recorded; no-op otherwise.
 */
static void tun_put_page(struct tun_page *tpage)
{
	if (tpage->page)
		__page_frag_cache_drain(tpage->page, tpage->count);
}

/*
 * Process one xdp_buff handed over through tun_sendmsg().
 *
 * The buffer's hard start holds a struct tun_xdp_hdr (buffer length and a
 * virtio-net header).  If an XDP program is attached and the header
 * carries no GSO type, the program is run directly on the buffer;
 * otherwise an skb is built and, when a program is attached, generic XDP
 * is run on that skb instead.
 *
 * @flush is set when the program's verdict is XDP_REDIRECT.
 * @tpage batches page references of buffers whose verdict is neither
 * redirect, tx nor pass, so they can be released in one go by
 * tun_put_page().
 *
 * Returns a negative errno on failure, 1 when the skb was queued on the
 * file's sk_write_queue (NAPI enabled), and 0 in all other cases.
 */
static int tun_xdp_one(struct tun_struct *tun,
		       struct tun_file *tfile,
		       struct xdp_buff *xdp, int *flush,
		       struct tun_page *tpage)
{
	unsigned int datasize = xdp->data_end - xdp->data;
	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
	struct virtio_net_hdr *gso = &hdr->gso;
	struct bpf_prog *xdp_prog;
	struct sk_buff *skb = NULL;
	struct sk_buff_head *queue;
	u32 rxhash = 0, act;
	int buflen = hdr->buflen;
	int ret = 0;
	bool skb_xdp = false;
	struct page *page;

	if (unlikely(datasize < ETH_HLEN))
		return -EINVAL;

	xdp_prog = rcu_dereference(tun->xdp_prog);
	if (xdp_prog) {
		/* GSO packets go through generic XDP on the built skb. */
		if (gso->gso_type) {
			skb_xdp = true;
			goto build;
		}

		xdp_init_buff(xdp, buflen, &tfile->xdp_rxq);
		xdp_set_data_meta_invalid(xdp);

		act = bpf_prog_run_xdp(xdp_prog, xdp);
		ret = tun_xdp_act(tun, xdp_prog, xdp, act);
		if (ret < 0) {
			put_page(virt_to_head_page(xdp->data));
			return ret;
		}

		switch (ret) {
		case XDP_REDIRECT:
			*flush = true;
			fallthrough;
		case XDP_TX:
			return 0;
		case XDP_PASS:
			break;
		default:
			/*
			 * Remember the page instead of releasing it right
			 * away; consecutive buffers from the same page
			 * only bump the counter.
			 */
			page = virt_to_head_page(xdp->data);
			if (tpage->page == page) {
				++tpage->count;
			} else {
				tun_put_page(tpage);
				tpage->page = page;
				tpage->count = 1;
			}
			return 0;
		}
	}

build:
	skb =
build_skb(xdp->data_hard_start, buflen);
	if (!skb) {
		ret = -ENOMEM;
		goto out;
	}

	/* Make the skb's data area match [xdp->data, xdp->data_end). */
	skb_reserve(skb, xdp->data - xdp->data_hard_start);
	skb_put(skb, xdp->data_end - xdp->data);

	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
		atomic_long_inc(&tun->rx_frame_errors);
		kfree_skb(skb);
		ret = -EINVAL;
		goto out;
	}

	skb->protocol = eth_type_trans(skb, tun->dev);
	skb_reset_network_header(skb);
	skb_probe_transport_header(skb);
	skb_record_rx_queue(skb, tfile->queue_index);

	if (skb_xdp) {
		/* Any verdict other than XDP_PASS ends processing with 0. */
		ret = do_xdp_generic(xdp_prog, &skb);
		if (ret != XDP_PASS) {
			ret = 0;
			goto out;
		}
	}

	if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
	    !tfile->detached)
		rxhash = __skb_get_hash_symmetric(skb);

	if (tfile->napi_enabled) {
		queue = &tfile->sk.sk_write_queue;
		spin_lock(&queue->lock);

		if (unlikely(tfile->detached)) {
			spin_unlock(&queue->lock);
			kfree_skb(skb);
			return -EBUSY;
		}

		__skb_queue_tail(queue, skb);
		spin_unlock(&queue->lock);
		/* Tell the caller one packet was queued for NAPI. */
		ret = 1;
	} else {
		netif_receive_skb(skb);
		ret = 0;
	}

	/* No need to disable preemption here since this function is
	 * always called with bh disabled
	 */
	dev_sw_netstats_rx_add(tun->dev, datasize);

	if (rxhash)
		tun_flow_update(tun, rxhash, tfile);

out:
	return ret;
}

/*
 * proto_ops.sendmsg for the tun socket (see tun_socket_ops).
 *
 * Two modes:
 *  - msg_control is a struct tun_msg_ctl of type TUN_MSG_PTR: ctl->ptr is
 *    treated as an array of ctl->num xdp_buff entries, each fed to
 *    tun_xdp_one() with bottom halves disabled and under the RCU read lock.
 *    Per-buffer errors are not propagated; the call returns @total_len.
 *  - otherwise the message iterator is passed to tun_get_user().
 *
 * Returns -EBADFD when tun_get() yields no device.
 */
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	int ret, i;
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun = tun_get(tfile);
	struct tun_msg_ctl *ctl = m->msg_control;
	struct xdp_buff *xdp;

	if (!tun)
		return -EBADFD;

	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
	    ctl && ctl->type == TUN_MSG_PTR) {
		struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
		struct tun_page tpage;
		int n = ctl->num;
		int flush = 0, queued = 0;

		memset(&tpage, 0, sizeof(tpage));

		local_bh_disable();
		rcu_read_lock();
		bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

		for (i = 0; i < n; i++) {
			xdp = &((struct xdp_buff *)ctl->ptr)[i];
			ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
			/* Positive returns count packets queued for NAPI. */
			if (ret > 0)
				queued += ret;
		}

		if (flush)
			xdp_do_flush();
if (tfile->napi_enabled && queued > 0)
			napi_schedule(&tfile->napi);

		bpf_net_ctx_clear(bpf_net_ctx);
		rcu_read_unlock();
		local_bh_enable();

		/* Release whatever page references the batch accumulated. */
		tun_put_page(&tpage);

		ret = total_len;
		goto out;
	}

	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
			   m->msg_flags & MSG_DONTWAIT,
			   m->msg_flags & MSG_MORE);
out:
	tun_put(tun);
	return ret;
}

/*
 * proto_ops.recvmsg for the tun socket (see tun_socket_ops).
 *
 * m->msg_control may carry an already-dequeued entry; it is handed to
 * tun_do_read(), or released with tun_ptr_free() on the early error paths.
 * Only MSG_DONTWAIT, MSG_TRUNC and MSG_ERRQUEUE are accepted in @flags.
 * With MSG_ERRQUEUE the request is served by sock_recv_errqueue() instead.
 *
 * When the packet is larger than @total_len, MSG_TRUNC is set in
 * m->msg_flags and the return value is the full size only if the caller
 * passed MSG_TRUNC, otherwise @total_len.
 */
static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
		       int flags)
{
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun = tun_get(tfile);
	void *ptr = m->msg_control;
	int ret;

	if (!tun) {
		ret = -EBADFD;
		goto out_free;
	}

	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
		ret = -EINVAL;
		goto out_put_tun;
	}
	if (flags & MSG_ERRQUEUE) {
		ret = sock_recv_errqueue(sock->sk, m, total_len,
					 SOL_PACKET, TUN_TX_TIMESTAMP);
		goto out;
	}
	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
	if (ret > (ssize_t)total_len) {
		m->msg_flags |= MSG_TRUNC;
		ret = flags & MSG_TRUNC ? ret : total_len;
	}
out:
	tun_put(tun);
	return ret;

	/* Error exits taken before tun_do_read() took over @ptr. */
out_put_tun:
	tun_put(tun);
out_free:
	tun_ptr_free(ptr);
	return ret;
}

/*
 * Length of one ring entry: the frame length for an XDP frame, the value of
 * __skb_array_len_with_tag() for anything else, 0 for a NULL entry.
 */
static int tun_ptr_peek_len(void *ptr)
{
	if (likely(ptr)) {
		if (tun_is_xdp_frame(ptr)) {
			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

			return xdpf->len;
		}
		return __skb_array_len_with_tag(ptr);
	} else {
		return 0;
	}
}

/*
 * proto_ops.peek_len: length of the entry at the head of the file's
 * tx_ring, or 0 when the file has no device attached.
 */
static int tun_peek_len(struct socket *sock)
{
	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
	struct tun_struct *tun;
	int ret = 0;

	tun = tun_get(tfile);
	if (!tun)
		return 0;

	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
	tun_put(tun);

	return ret;
}

/* Ops structure to mimic raw sockets with tun */
static const struct proto_ops tun_socket_ops = {
	.peek_len = tun_peek_len,
	.sendmsg = tun_sendmsg,
	.recvmsg = tun_recvmsg,
};

/* Protocol descriptor used by tun_chr_open(); each sock is a whole struct tun_file. */
static struct proto tun_proto = {
	.name		= "tun",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct tun_file),
};

/*
 * Subset of tun->flags reported to user space: the feature bits plus the
 * persist and device-type bits.
 */
static int tun_flags(struct tun_struct *tun)
{
	return tun->flags & (TUN_FEATURES | IFF_PERSIST
| IFF_TUN | IFF_TAP);
}

/* sysfs "tun_flags" attribute: tun_flags() printed in hex. */
static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr,
			      char *buf)
{
	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
	return sysfs_emit(buf, "0x%x\n", tun_flags(tun));
}

/*
 * sysfs "owner" attribute: the owning uid as seen from the reader's user
 * namespace, or "-1" when no valid owner is set.
 */
static ssize_t owner_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
	return uid_valid(tun->owner)?
		sysfs_emit(buf, "%u\n",
			   from_kuid_munged(current_user_ns(), tun->owner)) :
		sysfs_emit(buf, "-1\n");
}

/*
 * sysfs "group" attribute: the owning gid as seen from the reader's user
 * namespace, or "-1" when no valid group is set.
 */
static ssize_t group_show(struct device *dev, struct device_attribute *attr,
			  char *buf)
{
	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
	return gid_valid(tun->group) ?
		sysfs_emit(buf, "%u\n",
			   from_kgid_munged(current_user_ns(), tun->group)) :
		sysfs_emit(buf, "-1\n");
}

static DEVICE_ATTR_RO(tun_flags);
static DEVICE_ATTR_RO(owner);
static DEVICE_ATTR_RO(group);

/* NULL-terminated attribute list exposed through tun_attr_group. */
static struct attribute *tun_dev_attrs[] = {
	&dev_attr_tun_flags.attr,
	&dev_attr_owner.attr,
	&dev_attr_group.attr,
	NULL
};

static const struct attribute_group tun_attr_group = {
	.attrs = tun_dev_attrs
};

/*
 * Implementation of TUNSETIFF: attach @file to the device named in @ifr,
 * creating the device if no device of that name exists in @net.
 *
 * Existing device: the requested type (IFF_TUN/IFF_TAP) must match the
 * device's netdev_ops, the multi-queue setting must match, and the caller
 * must pass tun_capable() and the security hook.  IFF_TUN_EXCL turns an
 * existing device into -EBUSY.
 *
 * New device: requires CAP_NET_ADMIN in the namespace's user namespace; the
 * name defaults to "tun%d"/"tap%d" unless @ifr supplies one.
 *
 * On success the final device name is copied back into ifr->ifr_name and 0
 * is returned; otherwise a negative errno.
 */
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
	struct tun_struct *tun;
	struct tun_file *tfile = file->private_data;
	struct net_device *dev;
	int err;

	if (tfile->detached)
		return -EINVAL;

	/* IFF_NAPI_FRAGS needs CAP_NET_ADMIN and is valid only with IFF_NAPI on a TAP device. */
	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;

		if (!(ifr->ifr_flags & IFF_NAPI) ||
		    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
			return -EINVAL;
	}

	dev = __dev_get_by_name(net, ifr->ifr_name);
	if (dev) {
		if (ifr->ifr_flags & IFF_TUN_EXCL)
			return -EBUSY;
		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
			tun = netdev_priv(dev);
		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
			tun = netdev_priv(dev);
		else
			return -EINVAL;

		if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
		    !!(tun->flags & IFF_MULTI_QUEUE))
			return -EINVAL;

		if (!tun_capable(tun))
			return -EPERM;
		err = security_tun_dev_open(tun->security);
		if (err < 0)
			return
err;

		err = tun_attach(tun, file,
				 ifr->ifr_flags & IFF_NOFILTER,
				 ifr->ifr_flags & IFF_NAPI,
				 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
		if (err < 0)
			return err;

		if (tun->flags & IFF_MULTI_QUEUE &&
		    (tun->numqueues + tun->numdisabled > 1)) {
			/* One or more queue has already been attached, no need
			 * to initialize the device again.
			 */
			netdev_state_change(dev);
			return 0;
		}

		/* Replace the feature bits with the ones requested now. */
		tun->flags = (tun->flags & ~TUN_FEATURES) |
			      (ifr->ifr_flags & TUN_FEATURES);

		netdev_state_change(dev);
	} else {
		char *name;
		unsigned long flags = 0;
		int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
			     MAX_TAP_QUEUES : 1;

		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = security_tun_dev_create();
		if (err < 0)
			return err;

		/* Set dev type */
		if (ifr->ifr_flags & IFF_TUN) {
			/* TUN device */
			flags |= IFF_TUN;
			name = "tun%d";
		} else if (ifr->ifr_flags & IFF_TAP) {
			/* TAP device */
			flags |= IFF_TAP;
			name = "tap%d";
		} else
			return -EINVAL;

		/* A caller-supplied name overrides the default pattern. */
		if (*ifr->ifr_name)
			name = ifr->ifr_name;

		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
				       NET_NAME_UNKNOWN, tun_setup, queues,
				       queues);

		if (!dev)
			return -ENOMEM;

		dev_net_set(dev, net);
		dev->rtnl_link_ops = &tun_link_ops;
		dev->ifindex = tfile->ifindex;
		dev->sysfs_groups[0] = &tun_attr_group;

		tun = netdev_priv(dev);
		tun->dev = dev;
		tun->flags = flags;
		tun->txflt.count = 0;
		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

		tun->align = NET_SKB_PAD;
		tun->filter_attached = false;
		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
		tun->rx_batched = 0;
		RCU_INIT_POINTER(tun->steering_prog, NULL);

		/*
		 * NOTE(review): ifr and file are stashed here, presumably for
		 * use during device initialisation - confirm against
		 * tun_net_initialize(), which is not visible in this chunk.
		 */
		tun->ifr = ifr;
		tun->file = file;

		tun_net_initialize(dev);

		err = register_netdevice(tun->dev);
		if (err < 0) {
			free_netdev(dev);
			return err;
		}
		/* free_netdev() won't check refcnt, to avoid race
		 * with dev_put() we need publish tun after registration.
		 */
		rcu_assign_pointer(tfile->tun, tun);
	}

	if (ifr->ifr_flags & IFF_NO_CARRIER)
		netif_carrier_off(tun->dev);
	else
		netif_carrier_on(tun->dev);

	/* Make sure persistent devices do not get stuck in
	 * xoff state.
	 */
	if (netif_running(tun->dev))
		netif_tx_wake_all_queues(tun->dev);

	strcpy(ifr->ifr_name, tun->dev->name);
	return 0;
}

/* Fill @ifr with the device name and the user-visible flags (tun_flags()). */
static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
{
	strcpy(ifr->ifr_name, tun->dev->name);

	ifr->ifr_flags = tun_flags(tun);

}

/* This is like a cut-down ethtool ops, except done via tun fd so no
 * privs required.
 *
 * Translates the TUN_F_* bits in @arg into netdev features.  All offloads
 * are only considered when TUN_F_CSUM is set; TUN_F_UFO is accepted and
 * ignored; USO is enabled only if both TUN_F_USO4 and TUN_F_USO6 are
 * requested.  Any bit left over after translation yields -EINVAL.
 */
static int set_offload(struct tun_struct *tun, unsigned long arg)
{
	netdev_features_t features = 0;

	if (arg & TUN_F_CSUM) {
		features |= NETIF_F_HW_CSUM;
		arg &= ~TUN_F_CSUM;

		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
			if (arg & TUN_F_TSO_ECN) {
				features |= NETIF_F_TSO_ECN;
				arg &= ~TUN_F_TSO_ECN;
			}
			if (arg & TUN_F_TSO4)
				features |= NETIF_F_TSO;
			if (arg & TUN_F_TSO6)
				features |= NETIF_F_TSO6;
			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
		}

		arg &= ~TUN_F_UFO;

		/* TODO: for now USO4 and USO6 should work simultaneously */
		if (arg & TUN_F_USO4 && arg & TUN_F_USO6) {
			features |= NETIF_F_GSO_UDP_L4;
			arg &= ~(TUN_F_USO4 | TUN_F_USO6);
		}
	}

	/* This gives the user a way to test for new features in future by
	 * trying to set them.
	 */
	if (arg)
		return -EINVAL;

	tun->set_features = features;
	tun->dev->wanted_features &= ~TUN_USER_FEATURES;
	tun->dev->wanted_features |= features;
	netdev_update_features(tun->dev);

	return 0;
}

/*
 * Detach the socket filter from the first @n queues and mark the device as
 * having no filter attached.  Uses rtnl_dereference() on the queue array.
 */
static void tun_detach_filter(struct tun_struct *tun, int n)
{
	int i;
	struct tun_file *tfile;

	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		lock_sock(tfile->socket.sk);
		sk_detach_filter(tfile->socket.sk);
		release_sock(tfile->socket.sk);
	}

	tun->filter_attached = false;
}

/*
 * Attach tun->fprog to the socket of every active queue.  If attaching
 * fails on queue i, the filter is removed again from queues 0..i-1 and the
 * error is returned.
 */
static int tun_attach_filter(struct tun_struct *tun)
{
	int i, ret = 0;
	struct tun_file *tfile;

	for (i = 0; i < tun->numqueues; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		lock_sock(tfile->socket.sk);
		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
		release_sock(tfile->socket.sk);
		if (ret) {
			/* Roll back the queues that were already done. */
			tun_detach_filter(tun, i);
			return ret;
		}
	}

	tun->filter_attached = true;
	return ret;
}

/* Propagate tun->sndbuf to the socket of every active queue. */
static void tun_set_sndbuf(struct tun_struct *tun)
{
	struct tun_file *tfile;
	int i;

	for (i = 0; i < tun->numqueues; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		tfile->socket.sk->sk_sndbuf = tun->sndbuf;
	}
}

/*
 * Implementation of TUNSETQUEUE, run under the rtnl lock.
 *
 * IFF_ATTACH_QUEUE re-attaches a file that is currently detached (the
 * device is taken from tfile->detached); IFF_DETACH_QUEUE detaches a file
 * from a multi-queue device.  Anything else is -EINVAL.  On success a
 * netdev state-change notification is sent.
 */
static int tun_set_queue(struct file *file, struct ifreq *ifr)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun;
	int ret = 0;

	rtnl_lock();

	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
		tun = tfile->detached;
		if (!tun) {
			ret = -EINVAL;
			goto unlock;
		}
		ret = security_tun_dev_attach_queue(tun->security);
		if (ret < 0)
			goto unlock;
		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
				 tun->flags & IFF_NAPI_FRAGS, true);
	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
		tun = rtnl_dereference(tfile->tun);
		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
			ret = -EINVAL;
		else
			__tun_detach(tfile, false);
	} else
		ret = -EINVAL;

	/* tun is only dereferenced on the paths that left ret >= 0. */
	if (ret >= 0)
		netdev_state_change(tun->dev);

unlock:
	rtnl_unlock();
	return ret;
}

/*
 * Install or clear an eBPF program from user space.
 *
 * @data points to an int file descriptor: -1 clears the slot, any other
 * value must refer to a BPF_PROG_TYPE_SOCKET_FILTER program.  The actual
 * swap is done by __tun_set_ebpf().
 */
static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
			void __user *data)
{
	struct bpf_prog *prog;
	int fd;

	if (copy_from_user(&fd, data, sizeof(fd)))
return -EFAULT;

	if (fd == -1) {
		prog = NULL;
	} else {
		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	return __tun_set_ebpf(tun, prog_p, prog);
}

/* Return correct value for tun->dev->addr_len based on tun->dev->type.
 * Link types not listed here get an address length of 0.
 */
static unsigned char tun_get_addr_len(unsigned short type)
{
	switch (type) {
	case ARPHRD_IP6GRE:
	case ARPHRD_TUNNEL6:
		return sizeof(struct in6_addr);
	case ARPHRD_IPGRE:
	case ARPHRD_TUNNEL:
	case ARPHRD_SIT:
		return 4;
	case ARPHRD_ETHER:
		return ETH_ALEN;
	case ARPHRD_IEEE802154:
	case ARPHRD_IEEE802154_MONITOR:
		return IEEE802154_EXTENDED_ADDR_LEN;
	case ARPHRD_PHONET_PIPE:
	case ARPHRD_PPP:
	case ARPHRD_NONE:
		return 0;
	case ARPHRD_6LOWPAN:
		return EUI64_ADDR_LEN;
	case ARPHRD_FDDI:
		return FDDI_K_ALEN;
	case ARPHRD_HIPPI:
		return HIPPI_ALEN;
	case ARPHRD_IEEE802:
		return FC_ALEN;
	case ARPHRD_ROSE:
		return ROSE_ADDR_LEN;
	case ARPHRD_NETROM:
		return AX25_ADDR_LEN;
	case ARPHRD_LOCALTLK:
		return LTALK_ALEN;
	default:
		return 0;
	}
}

/*
 * Common ioctl implementation for the native and compat entry points.
 *
 * @ifreq_len is the size of the caller's ifreq layout (struct ifreq or
 * struct compat_ifreq) and bounds every copy of @ifr from/to user space.
 *
 * TUNGETFEATURES, TUNSETQUEUE and SIOCGSKNS are handled before the rtnl
 * lock is taken.  Everything else runs under rtnl_lock(); apart from
 * TUNSETIFF and TUNSETIFINDEX, all commands require the file to be attached
 * to a device and fail with -EBADFD otherwise.
 */
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg, int ifreq_len)
{
	struct tun_file *tfile = file->private_data;
	struct net *net = sock_net(&tfile->sk);
	struct tun_struct *tun;
	void __user* argp = (void __user*)arg;
	unsigned int carrier;
	struct ifreq ifr;
	kuid_t owner;
	kgid_t group;
	int ifindex;
	int sndbuf;
	int vnet_hdr_sz;
	int le;
	int ret;
	bool do_notify = false;

	/* Only these commands take a struct ifreq as input. */
	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
	    (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
		if (copy_from_user(&ifr, argp, ifreq_len))
			return -EFAULT;
	} else {
		memset(&ifr, 0, sizeof(ifr));
	}
	if (cmd == TUNGETFEATURES) {
		/* Currently this just means: "what IFF flags are valid?".
		 * This is needed because we never checked for invalid flags on
		 * TUNSETIFF.
		 */
		return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER |
				TUN_FEATURES, (unsigned int __user*)argp);
	} else if (cmd == TUNSETQUEUE) {
		return tun_set_queue(file, &ifr);
	} else if (cmd == SIOCGSKNS) {
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		return open_related_ns(&net->ns, get_net_ns);
	}

	rtnl_lock();

	tun = tun_get(tfile);
	if (cmd == TUNSETIFF) {
		/* A file can be bound to a device only once. */
		ret = -EEXIST;
		if (tun)
			goto unlock;

		/* Force NUL termination of the user-supplied name. */
		ifr.ifr_name[IFNAMSIZ-1] = '\0';

		ret = tun_set_iff(net, file, &ifr);

		if (ret)
			goto unlock;

		if (copy_to_user(argp, &ifr, ifreq_len))
			ret = -EFAULT;
		goto unlock;
	}
	if (cmd == TUNSETIFINDEX) {
		/* Only allowed before a device is attached. */
		ret = -EPERM;
		if (tun)
			goto unlock;

		ret = -EFAULT;
		if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
			goto unlock;
		ret = -EINVAL;
		if (ifindex < 0)
			goto unlock;
		ret = 0;
		tfile->ifindex = ifindex;
		goto unlock;
	}

	ret = -EBADFD;
	if (!tun)
		goto unlock;

	netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);

	/* From here on use the namespace of the attached device. */
	net = dev_net(tun->dev);
	ret = 0;
	switch (cmd) {
	case TUNGETIFF:
		tun_get_iff(tun, &ifr);

		if (tfile->detached)
			ifr.ifr_flags |= IFF_DETACH_QUEUE;
		if (!tfile->socket.sk->sk_filter)
			ifr.ifr_flags |= IFF_NOFILTER;

		if (copy_to_user(argp, &ifr, ifreq_len))
			ret = -EFAULT;
		break;

	case TUNSETNOCSUM:
		/* Disable/Enable checksum */

		/* [unimplemented] */
		netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
			   arg ? "disabled" : "enabled");
		break;

	case TUNSETPERSIST:
		/* Disable/Enable persist mode. Keep an extra reference to the
		 * module to prevent the module being unprobed.
		 */
		if (arg && !(tun->flags & IFF_PERSIST)) {
			tun->flags |= IFF_PERSIST;
			__module_get(THIS_MODULE);
			do_notify = true;
		}
		if (!arg && (tun->flags & IFF_PERSIST)) {
			tun->flags &= ~IFF_PERSIST;
			module_put(THIS_MODULE);
			do_notify = true;
		}

		netif_info(tun, drv, tun->dev, "persist %s\n",
			   arg ?
"enabled" : "disabled");
		break;

	case TUNSETOWNER:
		/* Set owner of the device */
		/* arg is interpreted as a uid in the caller's user namespace. */
		owner = make_kuid(current_user_ns(), arg);
		if (!uid_valid(owner)) {
			ret = -EINVAL;
			break;
		}
		tun->owner = owner;
		do_notify = true;
		netif_info(tun, drv, tun->dev, "owner set to %u\n",
			   from_kuid(&init_user_ns, tun->owner));
		break;

	case TUNSETGROUP:
		/* Set group of the device */
		/* arg is interpreted as a gid in the caller's user namespace. */
		group = make_kgid(current_user_ns(), arg);
		if (!gid_valid(group)) {
			ret = -EINVAL;
			break;
		}
		tun->group = group;
		do_notify = true;
		netif_info(tun, drv, tun->dev, "group set to %u\n",
			   from_kgid(&init_user_ns, tun->group));
		break;

	case TUNSETLINK:
		/* Only allow setting the type when the interface is down */
		if (tun->dev->flags & IFF_UP) {
			netif_info(tun, drv, tun->dev,
				   "Linktype set failed because interface is up\n");
			ret = -EBUSY;
		} else {
			/* Notifier subscribers may veto the type change. */
			ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
						       tun->dev);
			ret = notifier_to_errno(ret);
			if (ret) {
				netif_info(tun, drv, tun->dev,
					   "Refused to change device type\n");
				break;
			}
			tun->dev->type = (int) arg;
			/* Keep addr_len consistent with the new link type. */
			tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
			netif_info(tun, drv, tun->dev, "linktype set to %d\n",
				   tun->dev->type);
			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
						 tun->dev);
		}
		break;

	case TUNSETDEBUG:
		tun->msg_enable = (u32)arg;
		break;

	case TUNSETOFFLOAD:
		ret = set_offload(tun, arg);
		break;

	case TUNSETTXFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
			break;
		ret = update_filter(&tun->txflt, (void __user *)arg);
		break;

	case SIOCGIFHWADDR:
		/* Get hw address */
		/* The return value of dev_get_mac_address() is not checked. */
		dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
		if (copy_to_user(argp, &ifr, ifreq_len))
			ret = -EFAULT;
		break;

	case SIOCSIFHWADDR:
		/* Set hw address */
		ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
		break;

	case TUNGETSNDBUF:
		sndbuf = tfile->socket.sk->sk_sndbuf;
		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
			ret = -EFAULT;
		break;

	case TUNSETSNDBUF:
		if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
			ret = -EFAULT;
			break;
		}
		if
(sndbuf <= 0) {
			ret = -EINVAL;
			break;
		}

		/* Remember the value and apply it to every active queue. */
		tun->sndbuf = sndbuf;
		tun_set_sndbuf(tun);
		break;

	case TUNGETVNETHDRSZ:
		vnet_hdr_sz = tun->vnet_hdr_sz;
		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
			ret = -EFAULT;
		break;

	case TUNSETVNETHDRSZ:
		if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
			ret = -EFAULT;
			break;
		}
		/* The header area must hold at least a struct virtio_net_hdr. */
		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
			ret = -EINVAL;
			break;
		}

		tun->vnet_hdr_sz = vnet_hdr_sz;
		break;

	case TUNGETVNETLE:
		le = !!(tun->flags & TUN_VNET_LE);
		if (put_user(le, (int __user *)argp))
			ret = -EFAULT;
		break;

	case TUNSETVNETLE:
		if (get_user(le, (int __user *)argp)) {
			ret = -EFAULT;
			break;
		}
		if (le)
			tun->flags |= TUN_VNET_LE;
		else
			tun->flags &= ~TUN_VNET_LE;
		break;

	case TUNGETVNETBE:
		ret = tun_get_vnet_be(tun, argp);
		break;

	case TUNSETVNETBE:
		ret = tun_set_vnet_be(tun, argp);
		break;

	case TUNATTACHFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
			break;
		ret = -EFAULT;
		if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
			break;

		ret = tun_attach_filter(tun);
		break;

	case TUNDETACHFILTER:
		/* Can be set only for TAPs */
		ret = -EINVAL;
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
			break;
		ret = 0;
		tun_detach_filter(tun, tun->numqueues);
		break;

	case TUNGETFILTER:
		/* Like the other filter commands, rejected for non-TAP devices. */
		ret = -EINVAL;
		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
			break;
		ret = -EFAULT;
		if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
			break;
		ret = 0;
		break;

	case TUNSETSTEERINGEBPF:
		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
		break;

	case TUNSETFILTEREBPF:
		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
		break;

	case TUNSETCARRIER:
		ret = -EFAULT;
		if (copy_from_user(&carrier, argp, sizeof(carrier)))
			goto unlock;

		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
		break;

	case TUNGETDEVNETNS:
		ret = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto unlock;
		ret = open_related_ns(&net->ns, get_net_ns);
		break;

	default:
		ret = -EINVAL;
		break;
	}

	/* Set by the persist/owner/group commands above. */
	if (do_notify)
		netdev_state_change(tun->dev);

unlock:
rtnl_unlock();
	/* Drop the reference from tun_get(), if one was obtained. */
	if (tun)
		tun_put(tun);
	return ret;
}

/* Native ioctl entry point: uses the full struct ifreq layout. */
static long tun_chr_ioctl(struct file *file,
			  unsigned int cmd, unsigned long arg)
{
	return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
}

#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point.  For the commands listed in the switch, @arg is
 * converted with compat_ptr(); for every other command it is truncated to
 * compat_ulong_t.
 */
static long tun_chr_compat_ioctl(struct file *file,
			 unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case TUNSETIFF:
	case TUNGETIFF:
	case TUNSETTXFILTER:
	case TUNGETSNDBUF:
	case TUNSETSNDBUF:
	case SIOCGIFHWADDR:
	case SIOCSIFHWADDR:
		arg = (unsigned long)compat_ptr(arg);
		break;
	default:
		arg = (compat_ulong_t)arg;
		break;
	}

	/*
	 * compat_ifreq is shorter than ifreq, so we must not access beyond
	 * the end of that structure. All fields that are used in this
	 * driver are compatible though, we don't need to convert the
	 * contents.
	 */
	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
}
#endif /* CONFIG_COMPAT */

/*
 * fasync handler: registers or unregisters @file for SIGIO delivery and
 * mirrors the state in the TUN_FASYNC flag.  When enabling, the file owner
 * is set to the calling task's thread group.
 *
 * Returns 0 on success or the negative error from
 * file_f_owner_allocate() / fasync_helper().
 */
static int tun_chr_fasync(int fd, struct file *file, int on)
{
	struct tun_file *tfile = file->private_data;
	int ret;

	if (on) {
		ret = file_f_owner_allocate(file);
		if (ret)
			goto out;
	}

	if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
		goto out;

	if (on) {
		__f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
		tfile->flags |= TUN_FASYNC;
	} else
		tfile->flags &= ~TUN_FASYNC;
	/* A non-negative result from fasync_helper() is reported as 0. */
	ret = 0;
out:
	return ret;
}

/*
 * open handler: allocates a struct tun_file (as a sock of tun_proto) in the
 * opener's network namespace, initialises its tx ring with size 0, its NAPI
 * mutex, wait queue and embedded socket, and stores it in
 * file->private_data.  No device is attached yet.
 *
 * Returns 0 or -ENOMEM.
 */
static int tun_chr_open(struct inode *inode, struct file * file)
{
	struct net *net = current->nsproxy->net_ns;
	struct tun_file *tfile;

	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
					    &tun_proto, 0);
	if (!tfile)
		return -ENOMEM;
	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
		sk_free(&tfile->sk);
		return -ENOMEM;
	}

	mutex_init(&tfile->napi_mutex);
	RCU_INIT_POINTER(tfile->tun, NULL);
	tfile->flags = 0;
	tfile->ifindex = 0;

	init_waitqueue_head(&tfile->socket.wq.wait);

	tfile->socket.file = file;
	tfile->socket.ops = &tun_socket_ops;

	sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid());

	tfile->sk.sk_write_space = tun_sock_write_space;
	tfile->sk.sk_sndbuf = INT_MAX;

	file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);

	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);

	/* tun groks IOCB_NOWAIT just fine, mark it as such */
	file->f_mode |= FMODE_NOWAIT;
	return 0;
}

/* release handler: detaches the file via tun_detach(); always returns 0. */
static int tun_chr_close(struct inode *inode, struct file *file)
{
	struct tun_file *tfile = file->private_data;

	tun_detach(tfile, true);

	return 0;
}

#ifdef CONFIG_PROC_FS
/*
 * show_fdinfo handler: prints "iff:" followed by the name of the attached
 * device.  The name is empty when no device is attached, because ifr is
 * zeroed beforehand.
 */
static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
{
	struct tun_file *tfile = file->private_data;
	struct tun_struct *tun;
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));

	rtnl_lock();
	tun = tun_get(tfile);
	if (tun)
		tun_get_iff(tun, &ifr);
	rtnl_unlock();

	if (tun)
		tun_put(tun);

	seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
}
#endif

/* File operations of the tun character device. */
static const struct file_operations tun_fops = {
	.owner	= THIS_MODULE,
	.read_iter  = tun_chr_read_iter,
	.write_iter = tun_chr_write_iter,
	.poll	= tun_chr_poll,
	.unlocked_ioctl	= tun_chr_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = tun_chr_compat_ioctl,
#endif
	.open	= tun_chr_open,
	.release = tun_chr_close,
	.fasync = tun_chr_fasync,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = tun_chr_show_fdinfo,
#endif
};

/* Misc device registration data: minor TUN_MINOR, node name "net/tun". */
static struct miscdevice tun_miscdev = {
	.minor = TUN_MINOR,
	.name = "tun",
	.nodename = "net/tun",
	.fops = &tun_fops,
};

/* ethtool interface */

/*
 * Fill @cmd with the default link settings: empty supported/advertising
 * masks, SPEED_10000, full duplex, twisted-pair port, autonegotiation
 * disabled.
 */
static void tun_default_link_ksettings(struct net_device *dev,
				       struct ethtool_link_ksettings *cmd)
{
	ethtool_link_ksettings_zero_link_mode(cmd, supported);
	ethtool_link_ksettings_zero_link_mode(cmd, advertising);
	cmd->base.speed		= SPEED_10000;
	cmd->base.duplex	= DUPLEX_FULL;
	cmd->base.port		= PORT_TP;
	cmd->base.phy_address	= 0;
	cmd->base.autoneg	= AUTONEG_DISABLE;
}

/* ethtool get_link_ksettings: return the stored settings verbatim. */
static int tun_get_link_ksettings(struct net_device *dev,
				  struct ethtool_link_ksettings *cmd)
{
	struct tun_struct *tun = netdev_priv(dev);

	memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
	return 0;
}

/* ethtool set_link_ksettings: store @cmd verbatim, without validation. */
static int tun_set_link_ksettings(struct net_device *dev,
				  const struct ethtool_link_ksettings *cmd)
{
	struct tun_struct *tun = netdev_priv(dev);

	memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
return 0;
}

/*
 * ethtool get_drvinfo: driver name and version, plus "tun" or "tap" as the
 * bus info depending on the device type.
 */
static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	struct tun_struct *tun = netdev_priv(dev);

	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
	strscpy(info->version, DRV_VERSION, sizeof(info->version));

	switch (tun->flags & TUN_TYPE_MASK) {
	case IFF_TUN:
		strscpy(info->bus_info, "tun", sizeof(info->bus_info));
		break;
	case IFF_TAP:
		strscpy(info->bus_info, "tap", sizeof(info->bus_info));
		break;
	}
}

/* ethtool get_msglevel: current message-enable mask. */
static u32 tun_get_msglevel(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);

	return tun->msg_enable;
}

/* ethtool set_msglevel: replace the message-enable mask. */
static void tun_set_msglevel(struct net_device *dev, u32 value)
{
	struct tun_struct *tun = netdev_priv(dev);

	tun->msg_enable = value;
}

/* ethtool get_coalesce: report rx_batched as rx_max_coalesced_frames. */
static int tun_get_coalesce(struct net_device *dev,
			    struct ethtool_coalesce *ec,
			    struct kernel_ethtool_coalesce *kernel_coal,
			    struct netlink_ext_ack *extack)
{
	struct tun_struct *tun = netdev_priv(dev);

	ec->rx_max_coalesced_frames = tun->rx_batched;

	return 0;
}

/*
 * ethtool set_coalesce: store rx_max_coalesced_frames as rx_batched,
 * silently capped at NAPI_POLL_WEIGHT.
 */
static int tun_set_coalesce(struct net_device *dev,
			    struct ethtool_coalesce *ec,
			    struct kernel_ethtool_coalesce *kernel_coal,
			    struct netlink_ext_ack *extack)
{
	struct tun_struct *tun = netdev_priv(dev);

	if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
		tun->rx_batched = NAPI_POLL_WEIGHT;
	else
		tun->rx_batched = ec->rx_max_coalesced_frames;

	return 0;
}

/*
 * ethtool get_channels: the number of active queues, and the maximum
 * (MAX_TAP_QUEUES for multi-queue devices, 1 otherwise).
 */
static void tun_get_channels(struct net_device *dev,
			     struct ethtool_channels *channels)
{
	struct tun_struct *tun = netdev_priv(dev);

	channels->combined_count = tun->numqueues;
	channels->max_combined = tun->flags & IFF_MULTI_QUEUE ?
MAX_TAP_QUEUES : 1; } static const struct ethtool_ops tun_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES, .get_drvinfo = tun_get_drvinfo, .get_msglevel = tun_get_msglevel, .set_msglevel = tun_set_msglevel, .get_link = ethtool_op_get_link, .get_channels = tun_get_channels, .get_ts_info = ethtool_op_get_ts_info, .get_coalesce = tun_get_coalesce, .set_coalesce = tun_set_coalesce, .get_link_ksettings = tun_get_link_ksettings, .set_link_ksettings = tun_set_link_ksettings, }; static int tun_queue_resize(struct tun_struct *tun) { struct net_device *dev = tun->dev; struct tun_file *tfile; struct ptr_ring **rings; int n = tun->numqueues + tun->numdisabled; int ret, i; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; for (i = 0; i < tun->numqueues; i++) { tfile = rtnl_dereference(tun->tfiles[i]); rings[i] = &tfile->tx_ring; } list_for_each_entry(tfile, &tun->disabled, next) rings[i++] = &tfile->tx_ring; ret = ptr_ring_resize_multiple_bh(rings, n, dev->tx_queue_len, GFP_KERNEL, tun_ptr_free); kfree(rings); return ret; } static int tun_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tun_struct *tun = netdev_priv(dev); int i; if (dev->rtnl_link_ops != &tun_link_ops) return NOTIFY_DONE; switch (event) { case NETDEV_CHANGE_TX_QUEUE_LEN: if (tun_queue_resize(tun)) return NOTIFY_BAD; break; case NETDEV_UP: for (i = 0; i < tun->numqueues; i++) { struct tun_file *tfile; tfile = rtnl_dereference(tun->tfiles[i]); tfile->socket.sk->sk_write_space(tfile->socket.sk); } break; default: break; } return NOTIFY_DONE; } static struct notifier_block tun_notifier_block __read_mostly = { .notifier_call = tun_device_event, }; static int __init tun_init(void) { int ret = 0; pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION); ret = rtnl_link_register(&tun_link_ops); if (ret) { pr_err("Can't register link_ops\n"); goto err_linkops; } ret = 
misc_register(&tun_miscdev); if (ret) { pr_err("Can't register misc device %d\n", TUN_MINOR); goto err_misc; } ret = register_netdevice_notifier(&tun_notifier_block); if (ret) { pr_err("Can't register netdevice notifier\n"); goto err_notifier; } return 0; err_notifier: misc_deregister(&tun_miscdev); err_misc: rtnl_link_unregister(&tun_link_ops); err_linkops: return ret; } static void __exit tun_cleanup(void) { misc_deregister(&tun_miscdev); rtnl_link_unregister(&tun_link_ops); unregister_netdevice_notifier(&tun_notifier_block); } /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. */ struct socket *tun_get_socket(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->socket; } EXPORT_SYMBOL_GPL(tun_get_socket); struct ptr_ring *tun_get_tx_ring(struct file *file) { struct tun_file *tfile; if (file->f_op != &tun_fops) return ERR_PTR(-EINVAL); tfile = file->private_data; if (!tfile) return ERR_PTR(-EBADFD); return &tfile->tx_ring; } EXPORT_SYMBOL_GPL(tun_get_tx_ring); module_init(tun_init); module_exit(tun_cleanup); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); MODULE_ALIAS("devname:net/tun");
1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Zeroplus based devices * * Copyright (c) 2005, 2006 Anssi Hannula <anssi.hannula@gmail.com> */ /* */ #include <linux/hid.h> #include <linux/input.h> #include <linux/slab.h> #include <linux/module.h> #include "hid-ids.h" #ifdef CONFIG_ZEROPLUS_FF struct zpff_device { struct hid_report *report; }; static int zpff_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct zpff_device *zpff = data; int left, right; /* * The following is specified the other way around in the Zeroplus * datasheet but the order below is correct for the XFX Executioner; * however it is possible that the XFX Executioner is an exception */ left = effect->u.rumble.strong_magnitude; right = effect->u.rumble.weak_magnitude; dbg_hid("called with 0x%04x 0x%04x\n", left, right); left = left * 0x7f / 0xffff; right = right * 0x7f / 0xffff; zpff->report->field[2]->value[0] = left; zpff->report->field[3]->value[0] = right; dbg_hid("running with 0x%02x 0x%02x\n", left, right); hid_hw_request(hid, zpff->report, HID_REQ_SET_REPORT); return 0; } static int zpff_init(struct hid_device *hid) { struct zpff_device *zpff; struct hid_report *report; struct hid_input *hidinput; struct input_dev *dev; int i, error; if (list_empty(&hid->inputs)) { hid_err(hid, "no inputs found\n"); return -ENODEV; } hidinput = list_entry(hid->inputs.next, struct hid_input, list); dev = hidinput->input; for (i = 0; i < 4; i++) { 
report = hid_validate_values(hid, HID_OUTPUT_REPORT, 0, i, 1); if (!report) return -ENODEV; } zpff = kzalloc(sizeof(struct zpff_device), GFP_KERNEL); if (!zpff) return -ENOMEM; set_bit(FF_RUMBLE, dev->ffbit); error = input_ff_create_memless(dev, zpff, zpff_play); if (error) { kfree(zpff); return error; } zpff->report = report; zpff->report->field[0]->value[0] = 0x00; zpff->report->field[1]->value[0] = 0x02; zpff->report->field[2]->value[0] = 0x00; zpff->report->field[3]->value[0] = 0x00; hid_hw_request(hid, zpff->report, HID_REQ_SET_REPORT); hid_info(hid, "force feedback for Zeroplus based devices by Anssi Hannula <anssi.hannula@gmail.com>\n"); return 0; } #else static inline int zpff_init(struct hid_device *hid) { return 0; } #endif static int zp_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { hid_err(hdev, "hw start failed\n"); goto err; } zpff_init(hdev); return 0; err: return ret; } static const struct hid_device_id zp_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0005) }, { HID_USB_DEVICE(USB_VENDOR_ID_ZEROPLUS, 0x0030) }, { } }; MODULE_DEVICE_TABLE(hid, zp_devices); static struct hid_driver zp_driver = { .name = "zeroplus", .id_table = zp_devices, .probe = zp_probe, }; module_hid_driver(zp_driver); MODULE_DESCRIPTION("Force feedback support for Zeroplus based devices"); MODULE_LICENSE("GPL");
10 64 17 4 9 9 9 8 1 1 5 51 51 4 11 1 10 54 54 4 1 54 1 11 2 7 7 17 5 4 1 5 5 4 4 2 1 1 25 20 11 1 3 9 5 6 3 8 5 1 8 2 7 9 1 1 1 14 5 2 2 5 5 23 1 1 5 5 9 7 5 4 7 17 2 18 14 5 18 18 2 2 2 2 2 2 2 1 2 1 2 2 15 12 4 4 4 18 3 1 1 1 1 13 3 10 13 13 13 13 13 13 22 5 17 4 9 3 19 1 2 6 6 6 9 12 1 1 8 8 1 8 86 2 3 79 51 2 1 1 1 3 1 1 23 1 3 1 5 14 2 14 2 1 1 2 1 1 2 1 1 1 3 13 7 4 2 9 2 2 1 2 1 3 1 1 1 9 2 4 2 1 1 2 1776 1771 412 415 37 1 2 9 9 9 4 4 2 1 1 2 2 3 3 1 1 8 8 7 6 1 4 25 25 2 22 22 2 1 6 19 18 5 1 11 2 6 4 1 4 3 3 26 26 4 6 6 6 1 5 1 23 4 5 7 6 5 1 3 4 4 1 1 24 2 2 19 5 3 3 1 7 3 3 11 2 30 30 10 2 1 7 1 6 14 4 11 6 3 4 2 2 2 2 1 2 5 5 5 5 4 4 51 51 51 51 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 
860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 
1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 
1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 
2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 
2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 
2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 // SPDX-License-Identifier: GPL-2.0-or-later /* * IP multicast routing support for mrouted 3.6/3.8 * * (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk> * Linux Consultancy and Custom Driver Development * * Fixes: * Michael Chastain : Incorrect size of copying. * Alan Cox : Added the cache manager code * Alan Cox : Fixed the clone/copy bug and device race. 
* Mike McLagan : Routing by source * Malcolm Beattie : Buffer handling fixes. * Alexey Kuznetsov : Double buffer free and other fixes. * SVR Anand : Fixed several multicast bugs and problems. * Alexey Kuznetsov : Status, optimisations and more. * Brad Parker : Better behaviour on mrouted upcall * overflow. * Carlos Picoto : PIMv1 Support * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header * Relax this requirement to work with older peers. */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/cache.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/mroute.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/route.h> #include <net/icmp.h> #include <net/udp.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/export.h> #include <linux/rhashtable.h> #include <net/ip_tunnels.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/rtnh.h> #include <net/inet_dscp.h> #include <linux/nospec.h> struct ipmr_rule { struct fib_rule common; }; struct ipmr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. * Note that the changes are semaphored via rtnl_lock. 
*/ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved * entries is changed only in process context and protected * with weak lock mrt_lock. Queue of unresolved entries is protected * with strong spinlock mfc_unres_lock. * * In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *cache, int local); static int ipmr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES #define ipmr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv4.mr_tables)) static bool ipmr_can_free_table(struct net *net) { return !check_net(net) || !net_initialized(net); } static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv4.mr_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv4.mr_tables) return NULL; return ret; } static struct mr_table *__ipmr_get_table(struct net *net, u32 id) { struct mr_table 
*mrt; ipmr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static struct mr_table *ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; rcu_read_lock(); mrt = __ipmr_get_table(net, id); rcu_read_unlock(); return mrt; } static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { int err; struct ipmr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi4_to_flowi(flp4)); err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ipmr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = __ipmr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { return 1; } static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), .action = ipmr_rule_action, .match = ipmr_rule_match, .configure = ipmr_rule_configure, 
.compare = ipmr_rule_compare, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .owner = THIS_MODULE, }; static int __net_init ipmr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ipmr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv4.mr_tables); mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT); if (err < 0) goto err2; net->ipv4.mr_rules_ops = ops; return 0; err2: rtnl_lock(); ipmr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } static unsigned int ipmr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } bool ipmr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; } EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) static bool ipmr_can_free_table(struct net *net) { return !check_net(net); } static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv4.mrt; return NULL; } static struct mr_table *ipmr_get_table(struct net *net, u32 id) { return net->ipv4.mrt; } #define __ipmr_get_table ipmr_get_table static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { *mrt = net->ipv4.mrt; return 0; } static int __net_init ipmr_rules_init(struct net *net) { struct 
mr_table *mrt; mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv4.mrt = mrt; return 0; } static void __net_exit ipmr_rules_exit(struct net *net) { ASSERT_RTNL(); ipmr_free_table(net->ipv4.mrt); net->ipv4.mrt = NULL; } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ipmr_rules_seq_read(const struct net *net) { return 0; } bool ipmr_rule_default(const struct fib_rule *rule) { return true; } EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc_cache_cmp_arg *cmparg = arg->key; const struct mfc_cache *c = ptr; return cmparg->mfc_mcastgrp != c->mfc_mcastgrp || cmparg->mfc_origin != c->mfc_origin; } static const struct rhashtable_params ipmr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ipmr_hash_cmp, .automatic_shrinking = true, }; static void ipmr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); #endif } static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = { .mfc_mcastgrp = htonl(INADDR_ANY), .mfc_origin = htonl(INADDR_ANY), }; static struct mr_table_ops ipmr_mr_table_ops = { .rht_params = &ipmr_rht_params, .cmparg_any = &ipmr_mr_table_ops_cmparg_any, }; static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (id != RT_TABLE_DEFAULT && id >= 1000000000) return ERR_PTR(-EINVAL); mrt = __ipmr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ipmr_mr_table_ops, ipmr_expire_process, ipmr_new_table_set); } static void ipmr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); 
WARN_ON_ONCE(!ipmr_can_free_table(net));

	timer_shutdown_sync(&mrt->ipmr_expire_timer);
	/* Flush every VIF and MFC entry, static ones included. */
	mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
				 MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC);
	rhltable_destroy(&mrt->mfc_hash);
	kfree(mrt);
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

/* Initialize ipmr pimreg/tunnel in_device.
 *
 * Asserts RTNL.  Returns false when @dev has no in_device; otherwise calls
 * ipv4_devconf_setall() and neigh_parms_data_state_setall(), sets RP_FILTER
 * to 0 and returns true.
 */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
	struct in_device *in_dev;

	ASSERT_RTNL();

	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev)
		return false;
	ipv4_devconf_setall(in_dev);
	neigh_parms_data_state_setall(in_dev->arp_parms);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

	return true;
}

/* Create the "dvmrp<vifi>" IPIP tunnel device for a VIF.
 *
 * The tunnel is added through the ndo_tunnel_ctl(SIOCADDTUNNEL) hook of the
 * "tunl0" device, flagged IFF_MULTICAST, opened and switched to allmulti.
 *
 * Returns the new device with a reference taken by dev_hold(),
 * ERR_PTR(err) if dev_set_allmulti() fails (the device is closed and the
 * tunnel deleted again first), or ERR_PTR(-ENOBUFS) for every other failure.
 */
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
	struct net_device *tunnel_dev, *new_dev;
	struct ip_tunnel_parm_kern p = { };
	int err;

	tunnel_dev = __dev_get_by_name(net, "tunl0");
	if (!tunnel_dev)
		goto out;

	p.iph.daddr = v->vifc_rmt_addr.s_addr;
	p.iph.saddr = v->vifc_lcl_addr.s_addr;
	p.iph.version = 4;
	p.iph.ihl = 5;
	p.iph.protocol = IPPROTO_IPIP;
	sprintf(p.name, "dvmrp%d", v->vifc_vifi);

	if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl)
		goto out;
	err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
			SIOCADDTUNNEL);
	if (err)
		goto out;

	/* Look the freshly created tunnel up by the name requested above. */
	new_dev = __dev_get_by_name(net, p.name);
	if (!new_dev)
		goto out;

	new_dev->flags |= IFF_MULTICAST;
	if (!ipmr_init_vif_indev(new_dev))
		goto out_unregister;
	if (dev_open(new_dev, NULL))
		goto out_unregister;
	dev_hold(new_dev);
	err = dev_set_allmulti(new_dev, 1);
	if (err) {
		dev_close(new_dev);
		tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
				SIOCDELTUNNEL);
		dev_put(new_dev);
		new_dev = ERR_PTR(err);
	}
	return new_dev;

out_unregister:
	unregister_netdevice(new_dev);
out:
	return ERR_PTR(-ENOBUFS);
}

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
/* Transmit handler of the register device.
 *
 * Looks up the table for the flow, accounts the packet in the device tx
 * statistics, reports the whole packet through ipmr_cache_report() with
 * IGMPMSG_WHOLEPKT and frees the skb.  If the table lookup fails the skb is
 * freed and the negative error is returned instead of NETDEV_TX_OK.
 */
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct flowi4 fl4 = {
		.flowi4_oif = dev->ifindex,
		.flowi4_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
		.flowi4_mark = skb->mark,
	};
	int err;

	err = ipmr_fib_lookup(net, &fl4, &mrt);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}

	DEV_STATS_ADD(dev, tx_bytes, skb->len);
	DEV_STATS_INC(dev, tx_packets);
	rcu_read_lock();

	/* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
	ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
			  IGMPMSG_WHOLEPKT);

	rcu_read_unlock();
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

/* The register device always reports iflink 0. */
static int reg_vif_get_iflink(const struct net_device *dev)
{
	return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit = reg_vif_xmit,
	.ndo_get_iflink = reg_vif_get_iflink,
};

/* alloc_netdev() setup callback for the register device. */
static void reg_vif_setup(struct net_device *dev)
{
	dev->type = ARPHRD_PIMREG;
	/* MTU leaves room for one IPv4 header plus 8 more bytes. */
	dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags = IFF_NOARP;
	dev->netdev_ops = &reg_vif_netdev_ops;
	dev->needs_free_netdev = true;
	dev->netns_local = true;
}

/* Create, register and open the register device of @mrt.
 *
 * The device is named "pimreg" for RT_TABLE_DEFAULT and "pimreg<id>" for any
 * other table.  Returns the device with a reference taken by dev_hold(), or
 * NULL on any failure (the device is unregistered or freed first).
 */
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (mrt->id == RT_TABLE_DEFAULT)
		sprintf(name, "pimreg");
	else
		sprintf(name, "pimreg%u", mrt->id);

	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);

	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}

	if (!ipmr_init_vif_indev(dev))
		goto failure;
	if (dev_open(dev, NULL))
		goto failure;

	dev_hold(dev);

	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}

/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
		     unsigned int pimlen)
{
	struct net_device *reg_dev = NULL;
	struct iphdr *encap;
	int vif_num;

	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
	/* Check that:
	 * a. packet is really sent to a multicast group
	 * b. packet is not a NULL-REGISTER
	 * c.
packet is not truncated
	 */
	if (!ipv4_is_multicast(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + pimlen > skb->len)
		return 1;

	/* Pairs with WRITE_ONCE() in vif_add()/vif_delete() */
	vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
	if (vif_num >= 0)
		reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
	if (!reg_dev)
		return 1;

	/* Strip everything in front of the encapsulated IP header and hand
	 * the inner packet to the stack as if received on the register device.
	 */
	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = CHECKSUM_NONE;

	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

	netif_rx(skb);

	return NET_RX_SUCCESS;
}
#else
/* Stub for the #else branch: no register device is created, always NULL. */
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	return NULL;
}
#endif

/* Wrapper: forwards a VIF event to mr_call_vif_notifiers() for
 * RTNL_FAMILY_IPMR, passing the per-netns ipmr_seq counter.
 */
static int call_ipmr_vif_entry_notifiers(struct net *net,
					 enum fib_event_type event_type,
					 struct vif_device *vif,
					 struct net_device *vif_dev,
					 vifi_t vif_index, u32 tb_id)
{
	return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
				     vif, vif_dev, vif_index, tb_id,
				     &net->ipv4.ipmr_seq);
}

/* Wrapper: forwards an MFC event to mr_call_mfc_notifiers() for
 * RTNL_FAMILY_IPMR, passing the per-netns ipmr_seq counter.
 */
static int call_ipmr_mfc_entry_notifiers(struct net *net,
					 enum fib_event_type event_type,
					 struct mfc_cache *mfc, u32 tb_id)
{
	return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type,
				     &mfc->_c, tb_id, &net->ipv4.ipmr_seq);
}

/**
 *	vif_delete - Delete a VIF entry
 *	@mrt: Table to delete from
 *	@vifi: VIF identifier to delete
 *	@notify: Set to 1, if the caller is a notifier_call
 *	@head: if unregistering the VIF, place it on this queue
 *
 *	Return: 0 on success, -EADDRNOTAVAIL if @vifi is out of range or the
 *	slot has no device.
 */
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
		      struct list_head *head)
{
	struct net *net = read_pnet(&mrt->net);
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= mrt->maxvif)
		return -EADDRNOTAVAIL;

	v = &mrt->vif_table[vifi];

	dev = rtnl_dereference(v->dev);
	if (!dev)
		return -EADDRNOTAVAIL;

	spin_lock(&mrt_lock);
	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
				      vifi, mrt->id);
	RCU_INIT_POINTER(v->dev, NULL);

	if (vifi == mrt->mroute_reg_vif_num) {
		/* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
		WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
	}
	/* Deleting the highest VIF: shrink maxvif to just past the highest
	 * slot that still exists (0 if none does).
	 */
	if (vifi + 1 == mrt->maxvif) {
		int tmp;

		for (tmp = vifi - 1; tmp >= 0; tmp--) {
			if (VIF_EXISTS(mrt, tmp))
				break;
		}
		WRITE_ONCE(mrt->maxvif, tmp + 1);
	}

	spin_unlock(&mrt_lock);

	dev_set_allmulti(dev, -1);

	in_dev = __in_dev_get_rtnl(dev);
	if (in_dev) {
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
		inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
					    NETCONFA_MC_FORWARDING,
					    dev->ifindex, &in_dev->cnf);
		ip_rt_multicast_event(in_dev);
	}

	/* Tunnel and register devices are queued for unregistration, except
	 * when the caller is the notifier itself.
	 */
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
		unregister_netdevice_queue(dev, head);

	netdev_put(dev, &v->dev_tracker);
	return 0;
}

/* RCU callback: return the cache entry to mrt_cachep. */
static void ipmr_cache_free_rcu(struct rcu_head *head)
{
	struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);

	kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
}

/* Free a cache entry through call_rcu(). */
static void ipmr_cache_free(struct mfc_cache *c)
{
	call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
}

/* Destroy an unresolved cache entry, killing queued skbs
 * and reporting error to netlink readers.
 */
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&mrt->cache_resolve_queue_len);

	while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
		/* A queued skb whose IP version field is 0 is handled as a
		 * netlink request: it is rewritten into an NLMSG_ERROR
		 * carrying -ETIMEDOUT and unicast back to the sender.
		 */
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = nlmsg_data(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
		} else {
			kfree_skb(skb);
		}
	}

	ipmr_cache_free(c);
}

/* Timer process for the unresolved queue.
*/ static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); struct mr_mfc *c, *next; unsigned long expires; unsigned long now; if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); return; } if (list_empty(&mrt->mfc_unres_queue)) goto out; now = jiffies; expires = 10*HZ; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE); ipmr_destroy_unres(mrt, (struct mfc_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); out: spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXVIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXVIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { struct netdev_phys_item_id ppid = { }; int vifi = vifc->vifc_vifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; int err; /* Is vif busy ? 
*/ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->vifc_flags) { case VIFF_REGISTER: if (!ipmr_pimsm_enabled()) return -EINVAL; /* Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ipmr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); if (IS_ERR(dev)) return PTR_ERR(dev); break; case VIFF_USE_IFINDEX: case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); if (dev && !__in_dev_get_rtnl(dev)) { dev_put(dev); return -EADDRNOTAVAIL; } } else { dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); } if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) { dev_put(dev); return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->vifc_flags | (!mrtsock ? 
VIFF_STATIC : 0),
			(VIFF_TUNNEL | VIFF_REGISTER));

	/* Record the port parent id when one is available, else mark it empty. */
	err = dev_get_port_parent_id(dev, &ppid, true);
	if (err == 0) {
		memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
		v->dev_parent_id.id_len = ppid.id_len;
	} else {
		v->dev_parent_id.id_len = 0;
	}

	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;

	/* And finish update writing critical data */
	spin_lock(&mrt_lock);
	rcu_assign_pointer(v->dev, dev);
	netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
	if (v->flags & VIFF_REGISTER) {
		/* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
		WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
	}
	if (vifi+1 > mrt->maxvif)
		WRITE_ONCE(mrt->maxvif, vifi + 1);
	spin_unlock(&mrt_lock);
	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
				      vifi, mrt->id);
	return 0;
}

/* Look up the entry for (@origin, @mcastgrp) through mr_mfc_find().
 *
 * called with rcu_read_lock()
 */
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
					 __be32 origin,
					 __be32 mcastgrp)
{
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = origin
	};

	return mr_mfc_find(mrt, &arg);
}

/* Look for a (*,G) entry.
 *
 * When @mcastgrp itself is INADDR_ANY the lookup goes through
 * mr_mfc_find_any_parent() instead of mr_mfc_find_any().
 */
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
					     __be32 mcastgrp, int vifi)
{
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = htonl(INADDR_ANY)
	};

	if (mcastgrp == htonl(INADDR_ANY))
		return mr_mfc_find_any_parent(mrt, vifi);
	return mr_mfc_find_any(mrt, vifi, &arg);
}

/* Look for a (S,G,iif) entry if parent != -1 */
static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
						__be32 origin, __be32 mcastgrp,
						int parent)
{
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = origin,
	};

	return mr_mfc_find_parent(mrt, &arg, parent);
}

/* Allocate a multicast cache entry.
 *
 * Zeroed GFP_KERNEL allocation from mrt_cachep with the refcount set to 1.
 * Returns NULL when the allocation fails.
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);

	if (c) {
		c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
		c->_c.mfc_un.res.minvif = MAXVIFS;
		c->_c.free = ipmr_cache_free_rcu;
		refcount_set(&c->_c.mfc_un.res.refcount, 1);
	}
	return c;
}

/* Allocate an unresolved cache entry (GFP_ATOMIC) with an empty skb queue
 * and an expiry 10 * HZ jiffies from now.  Returns NULL on failure.
 */
static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);

	if (c) {
		skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
		c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
	}
	return c;
}

/* A cache entry has gone into a resolved state from queued */
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
			       struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/* Play the pending entries through our router */
	while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
		/* IP version 0 marks a queued netlink request: answer it with
		 * the route filled in from @c, or with an NLMSG_ERROR carrying
		 * -EMSGSIZE when mr_fill_mroute() does not return > 0.
		 */
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct iphdr));

			if (mr_fill_mroute(mrt, skb, &c->_c,
					   nlmsg_data(nlh)) > 0) {
				nlh->nlmsg_len = skb_tail_pointer(skb) -
						 (u8 *)nlh;
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len =
					nlmsg_msg_size(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = nlmsg_data(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
		} else {
			rcu_read_lock();
			ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
			rcu_read_unlock();
		}
	}
}

/* Bounce a cache query up to mrouted and netlink.
 *
 * Called under rcu_read_lock().
 */
static int ipmr_cache_report(const struct mr_table *mrt,
			     struct sk_buff *pkt, vifi_t vifi, int assert)
{
	const int ihl = ip_hdrlen(pkt);
	struct sock *mroute_sk;
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	struct sk_buff *skb;
	int ret;

	mroute_sk = rcu_dereference(mrt->mroute_sk);
	if (!mroute_sk)
		return -EINVAL;

	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

	if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
		/* Ugly, but we have no choice with this interface.
		 * Duplicate old header, fix ihl, length etc.
* And all this only to mangle msg->im_msgtype and * to set msg->im_mbz to "mbz" :-) */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); skb_reset_transport_header(skb); msg = (struct igmpmsg *)skb_network_header(skb); memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0; if (assert == IGMPMSG_WRVIFWHOLE) { msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; } else { /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num); msg->im_vif = vif_num; msg->im_vif_hi = vif_num >> 8; } ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr)); } else { /* Copy the IP header */ skb_set_network_header(skb, skb->len); skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); /* Flag to the kernel this is a route add */ ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; msg->im_vif_hi = vifi >> 8; ipv4_pktinfo_prepare(mroute_sk, pkt, false); memcpy(skb->cb, pkt->cb, sizeof(skb->cb)); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; igmp->code = 0; ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } igmpmsg_netlink_event(mrt, skb); /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! 
*/ /* Called under rcu_read_lock() */ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); struct mfc_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { found = true; break; } } if (!found) { /* Create a new entry if allowable */ c = ipmr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ipmr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) mod_timer(&mrt->ipmr_expire_timer, c->_c.mfc_un.unres.expires); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* MFC cache manipulation by user space mroute daemon */ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params); 
list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { struct mfc_cache *uc, *c; struct mr_mfc *_uc; bool found; int ret; if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) && !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr)) return -EINVAL; c = ipmr_cache_alloc(); if (!c) return -ENOMEM; c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; c->_c.mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ipmr_rht_params); if (ret) { pr_err("ipmr: rhtable insert error %d\n", ret); ipmr_cache_free(c); return ret; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. 
*/ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc_cache *)_uc; if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct net *net = read_pnet(&mrt->net); struct mr_mfc *c, *tmp; struct mfc_cache *cache; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) continue; vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); cache = (struct mfc_cache *)c; call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); mroute_netlink_event(mrt, cache, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); cache = (struct mfc_cache *)c; 
mroute_netlink_event(mrt, cache, RTM_DELROUTE); ipmr_destroy_unres(mrt, cache); } spin_unlock_bh(&mfc_unres_lock); } } } /* called from ip_ra_control(), before an RCU grace period, * we don't need to call synchronize_rcu() here */ static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct mr_table *mrt; rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); } } rtnl_unlock(); } /* Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { struct net *net = sock_net(sk); int val, ret = 0, parent = 0; struct mr_table *mrt; struct vifctl vif; struct mfcctl mfc; bool do_wrvifwhole; u32 uval; /* There's one exception to the lock - MRT_DONE which needs to unlock */ rtnl_lock(); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_IGMP) { ret = -EOPNOTSUPP; goto out_unlock; } mrt = __ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); if (!mrt) { ret = -ENOENT; goto out_unlock; } if (optname != MRT_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) { ret = -EACCES; goto out_unlock; } } switch (optname) { case MRT_INIT: if (optlen != sizeof(int)) { ret = -EINVAL; break; } if (rtnl_dereference(mrt->mroute_sk)) { ret = -EADDRINUSE; break; } ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } break; case MRT_DONE: if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { /* We need to unlock here because mrtsock_destruct takes * care of rtnl itself and we can't change that due to * the IP_ROUTER_ALERT setsockopt which runs without it. */ rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); goto out; } break; case MRT_ADD_VIF: case MRT_DEL_VIF: if (optlen != sizeof(vif)) { ret = -EINVAL; break; } if (copy_from_sockptr(&vif, optval, sizeof(vif))) { ret = -EFAULT; break; } if (vif.vifc_vifi >= MAXVIFS) { ret = -ENFILE; break; } if (optname == MRT_ADD_VIF) { ret = vif_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } break; /* Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. 
*/ case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; fallthrough; case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { ret = -EINVAL; break; } if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) { ret = -EFAULT; break; } if (parent == 0) parent = mfc.mfcc_parent; if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) ret = ipmr_mfc_delete(mrt, &mfc, parent); else ret = ipmr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); break; case MRT_FLUSH: if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } mroute_clean_tables(mrt, val); break; /* Control PIM assert. */ case MRT_ASSERT: if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } mrt->mroute_do_assert = val; break; case MRT_PIM: if (!ipmr_pimsm_enabled()) { ret = -ENOPROTOOPT; break; } if (optlen != sizeof(val)) { ret = -EINVAL; break; } if (copy_from_sockptr(&val, optval, sizeof(val))) { ret = -EFAULT; break; } do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); val = !!val; if (val != mrt->mroute_do_pim) { mrt->mroute_do_pim = val; mrt->mroute_do_assert = val; mrt->mroute_do_wrvifwhole = do_wrvifwhole; } break; case MRT_TABLE: if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) { ret = -ENOPROTOOPT; break; } if (optlen != sizeof(uval)) { ret = -EINVAL; break; } if (copy_from_sockptr(&uval, optval, sizeof(uval))) { ret = -EFAULT; break; } if (sk == rtnl_dereference(mrt->mroute_sk)) { ret = -EBUSY; } else { mrt = ipmr_new_table(net, uval); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw_sk(sk)->ipmr_table = uval; } break; /* Spurious command, or MRT_VERSION which you cannot set. 
*/
	default:
		ret = -ENOPROTOOPT;
	}
out_unlock:
	rtnl_unlock();
out:
	return ret;
}

/* Execute if this ioctl is a special mroute ioctl.
 *
 * SIOCGETVIFCNT and SIOCGETSGCNT are passed to sock_ioctl_inout() with a
 * stack buffer of the matching request type.  Any other @cmd returns 1.
 */
int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	switch (cmd) {
	/* These userspace buffers will be consumed by ipmr_ioctl() */
	case SIOCGETVIFCNT: {
		struct sioc_vif_req buffer;

		return sock_ioctl_inout(sk, cmd, arg, &buffer,
				      sizeof(buffer));
		}
	case SIOCGETSGCNT: {
		struct sioc_sg_req buffer;

		return sock_ioctl_inout(sk, cmd, arg, &buffer,
				      sizeof(buffer));
		}
	}
	/* return code > 0 means that the ioctl was not executed */
	return 1;
}

/* Getsock opt support for the multicast routing system.
 *
 * Handles MRT_VERSION, MRT_PIM and MRT_ASSERT.  At most sizeof(int) bytes of
 * the value are copied out and the length actually used is written back to
 * @optlen.
 *
 * Returns 0 on success; -EOPNOTSUPP for a socket that is not a raw IGMP
 * socket; -ENOENT when the table is missing; -ENOPROTOOPT for an unsupported
 * option; -EINVAL for a negative length; -EFAULT on a failed copy.
 */
int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
			 sockptr_t optlen)
{
	int olr;
	int val;
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	if (sk->sk_type != SOCK_RAW ||
	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
		return -EOPNOTSUPP;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	switch (optname) {
	case MRT_VERSION:
		val = 0x0305;
		break;
	case MRT_PIM:
		if (!ipmr_pimsm_enabled())
			return -ENOPROTOOPT;
		val = mrt->mroute_do_pim;
		break;
	case MRT_ASSERT:
		val = mrt->mroute_do_assert;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (copy_from_sockptr(&olr, optlen, sizeof(int)))
		return -EFAULT;
	if (olr < 0)
		return -EINVAL;

	/* Never copy out more than the int-sized value. */
	olr = min_t(unsigned int, olr, sizeof(int));

	if (copy_to_sockptr(optlen, &olr, sizeof(int)))
		return -EFAULT;
	if (copy_to_sockptr(optval, &val, olr))
		return -EFAULT;
	return 0;
}

/* The IP multicast ioctl support routines.
 *
 * @arg is dereferenced directly as a struct sioc_vif_req or
 * struct sioc_sg_req, depending on @cmd, and the counters are written back
 * into it.
 *
 * Returns 0 on success; -ENOENT when the table is missing; -EINVAL when the
 * VIF index is >= maxvif; -EADDRNOTAVAIL when the VIF or cache entry does
 * not exist; -ENOIOCTLCMD for any other @cmd.
 */
int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{
	struct vif_device *vif;
	struct mfc_cache *c;
	struct net *net = sock_net(sk);
	struct sioc_vif_req *vr;
	struct sioc_sg_req *sr;
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	switch (cmd) {
	case SIOCGETVIFCNT:
		vr = (struct sioc_vif_req *)arg;
		if (vr->vifi >= mrt->maxvif)
			return -EINVAL;
		/* Clamp the already range-checked index via array_index_nospec(). */
		vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif);
		rcu_read_lock();
		vif = &mrt->vif_table[vr->vifi];
		if (VIF_EXISTS(mrt, vr->vifi)) {
			vr->icount = READ_ONCE(vif->pkt_in);
			vr->ocount = READ_ONCE(vif->pkt_out);
			vr->ibytes = READ_ONCE(vif->bytes_in);
			vr->obytes = READ_ONCE(vif->bytes_out);
			rcu_read_unlock();

			return 0;
		}
		rcu_read_unlock();
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		sr = (struct sioc_sg_req *)arg;

		rcu_read_lock();
		c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
		if (c) {
			sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
			sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
			sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
			rcu_read_unlock();
			return 0;
		}
		rcu_read_unlock();
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}

#ifdef CONFIG_COMPAT
/* Compat layout of the SIOCGETSGCNT request (counters as compat_ulong_t). */
struct compat_sioc_sg_req {
	struct in_addr src;
	struct in_addr grp;
	compat_ulong_t pktcnt;
	compat_ulong_t bytecnt;
	compat_ulong_t wrong_if;
};

/* Compat layout of the SIOCGETVIFCNT request (counters as compat_ulong_t). */
struct compat_sioc_vif_req {
	vifi_t	vifi;		/* Which iface */
	compat_ulong_t icount;
	compat_ulong_t ocount;
	compat_ulong_t ibytes;
	compat_ulong_t obytes;
};

/* Compat variant of ipmr_ioctl(): same commands and return codes, but the
 * request is copied from and back to user space here using the compat
 * layouts, adding -EFAULT for a failed copy.
 */
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	struct compat_sioc_sg_req sr;
	struct compat_sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= mrt->maxvif)
			return -EINVAL;
		vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
		rcu_read_lock();
		vif = &mrt->vif_table[vr.vifi];
		if (VIF_EXISTS(mrt, vr.vifi)) {
			vr.icount = READ_ONCE(vif->pkt_in);
			vr.ocount = READ_ONCE(vif->pkt_out);
			vr.ibytes = READ_ONCE(vif->bytes_in);
			vr.obytes = READ_ONCE(vif->bytes_out);
			rcu_read_unlock();

			if (copy_to_user(arg, &vr, sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		rcu_read_unlock();
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		rcu_read_lock();
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
			sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
			sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
			rcu_read_unlock();

			if (copy_to_user(arg, &sr, sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		rcu_read_unlock();
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}
#endif

/* Netdevice notifier: on NETDEV_UNREGISTER, delete every VIF in every table
 * of the device's netns that points at the device.  All other events are
 * ignored.  Always returns NOTIFY_DONE.
 */
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct vif_device *v;
	int ct;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	ipmr_for_each_table(mrt, net) {
		v = &mrt->vif_table[0];
		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
			/* notify == 1: called from the notifier itself. */
			if (rcu_access_pointer(v->dev) == dev)
				vif_delete(mrt, ct, 1, NULL);
		}
	}
	return NOTIFY_DONE;
}

static struct notifier_block ip_mr_notifier = {
	.notifier_call = ipmr_device_event,
};

/* Encapsulate a packet by attaching a valid IPIP header to it.
 * This avoids tunnel drivers and other mess and gives us the speed so
 * important for multicast video.
*/ static void ip_encap(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct iphdr *iph; const struct iphdr *old_iph = ip_hdr(skb); skb_push(skb, sizeof(struct iphdr)); skb->transport_header = skb->network_header; skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->tos = old_iph->tos; iph->ttl = old_iph->ttl; iph->frag_off = 0; iph->daddr = daddr; iph->saddr = saddr; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); ip_select_ident(net, skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); return dst_output(net, sk, skb); } #ifdef CONFIG_NET_SWITCHDEV static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, int in_vifi, int out_vifi) { struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi]; if (!skb->offload_l3_fwd_mark) return false; if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) return false; return netdev_phys_item_id_same(&out_vif->dev_parent_id, &in_vif->dev_parent_id); } #else static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, int in_vifi, int out_vifi) { return false; } #endif /* Processing handlers for ipmr_forward, under rcu_read_lock() */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, int in_vifi, struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; if (vif->flags & VIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); 
WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
		DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
		DEV_STATS_INC(vif_dev, tx_packets);
		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
		goto out_free;
	}

	/* ipmr_forward_offloaded() said the pair in_vifi/vifi is offloaded:
	 * nothing is transmitted here, the skb is just freed.
	 */
	if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
		goto out_free;

	/* Route lookup: a tunnel vif routes towards vif->remote from
	 * vif->local and needs room for one more IP header; any other vif
	 * routes towards the packet's own destination.
	 */
	if (vif->flags & VIFF_TUNNEL) {
		rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link);
		if (IS_ERR(rt))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link);
		if (IS_ERR(rt))
			goto out_free;
	}

	dev = rt->dst.dev;

	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		 * allow to send ICMP, so that packets will disappear
		 * to blackhole.
		 */
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	/* total headroom: optional outer IP header + link layer + dst header */
	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
	WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);

	/* replace the skb's dst with the route just looked up */
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
	/* ip_hdr() is read again here rather than reusing iph, which was
	 * taken before skb_cow()
	 */
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR
	 */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(net, skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		DEV_STATS_INC(vif_dev, tx_packets);
		DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
	}

	/* ip_mr_input() checks this flag and does not forward such a packet again */
	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
	 * not only before forwarding, but after forwarding on all output
	 * interfaces. It is clear, if mrouter runs a multicasting
	 * program, it should receive packets not depending to what interface
	 * program is joined.
	 * If we will not make it, the program will have to join on all
	 * interfaces.
On the other hand, multihoming host (or router, but
	 * not mrouter) cannot join to more than one interface - it will
	 * result in receiving multiple packets.
	 */
	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, net, NULL, skb, skb->dev, dev, ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
}

/* Called with mrt_lock or rcu_read_lock() */
/* Returns the highest vif index in mrt whose device is dev, or -1 when no
 * vif uses dev (the loop then runs ct down past 0).
 */
static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
{
	int ct;

	/* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */
	for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
		if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */
/* Called under rcu_read_lock() */
/* Forwards skb, which arrived on dev, according to cache entry c.
 * When local is zero the skb is consumed: it is either passed to
 * ipmr_queue_xmit() or freed.  When local is non-zero only clones are
 * transmitted and the caller keeps the skb.
 */
static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc_cache *c, int local)
{
	int true_vifi = ipmr_find_vif(mrt, dev);	/* vif of the arrival device, -1 if none */
	int psend = -1;		/* vif whose transmission is still pending, -1 if none */
	int vif, ct;

	vif = c->_c.mfc_parent;
	/* per-entry statistics and last-use timestamp */
	atomic_long_inc(&c->_c.mfc_un.res.pkt);
	atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
	WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);

	if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
		struct mfc_cache *cache_proxy;

		/* For an (*,G) entry, we only check that the incoming
		 * interface is part of the static tree.
		 */
		cache_proxy = mr_mfc_find_any_parent(mrt, vif);
		if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
			goto forward;
	}

	/* Wrong interface: drop packet and (maybe) send PIM assert. */
	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
		if (rt_is_output_route(skb_rtable(skb))) {
			/* It is our own packet, looped back.
			 * Very complicated situation...
			 *
			 * The best workaround until routing daemons will be
			 * fixed is not to redistribute packet, if it was
			 * send through wrong interface. It means, that
			 * multicast applications WILL NOT work for
			 * (S,G), which have default multicast route pointing
			 * to wrong oif. In any case, it is not a good
			 * idea to use multicasting applications on router.
*/
			goto dont_forward;
		}

		atomic_long_inc(&c->_c.mfc_un.res.wrong_if);

		/* Report the wrong-interface packet through ipmr_cache_report()
		 * only when asserts are enabled and at least MFC_ASSERT_THRESH
		 * jiffies have passed since the previous report for this entry.
		 */
		if (true_vifi >= 0 && mrt->mroute_do_assert &&
		    /* pimsm uses asserts, when switching from RPT to SPT,
		     * so that we cannot check that packet arrived on an oif.
		     * It is bad, but otherwise we would need to move pretty
		     * large chunk of pimd to kernel. Ough... --ANK
		     */
		    (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			c->_c.mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
			if (mrt->mroute_do_wrvifwhole)
				ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRVIFWHOLE);
		}
		goto dont_forward;
	}

forward:
	/* input statistics are accounted on the entry's parent vif */
	WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1);
	WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len);

	/* Forward the frame */
	if (c->mfc_origin == htonl(INADDR_ANY) && c->mfc_mcastgrp == htonl(INADDR_ANY)) {
		if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent &&
		    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
			/* It's an (*,*) entry and the packet is not coming from
			 * the upstream: forward the packet to the upstream
			 * only.
*/
			psend = c->_c.mfc_parent;
			goto last_forward;
		}
		goto dont_forward;
	}

	/* Walk the entry's vif range from high to low.  Transmission to a
	 * matching vif is deferred by one step (psend): every earlier match
	 * gets a clone, so that the last match can be given the original skb
	 * at last_forward.
	 */
	for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) {
		/* For (*,G) entry, don't forward to the incoming interface */
		if ((c->mfc_origin != htonl(INADDR_ANY) || ct != true_vifi) &&
		    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					ipmr_queue_xmit(net, mrt, true_vifi, skb2, psend);
			}
			psend = ct;
		}
	}
last_forward:
	if (psend != -1) {
		if (local) {
			/* the caller still needs skb: send a clone */
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

			if (skb2)
				ipmr_queue_xmit(net, mrt, true_vifi, skb2, psend);
		} else {
			/* ipmr_queue_xmit() consumes skb, nothing left to free */
			ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);
			return;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
}

/* Select the multicast routing table for skb.
 *
 * Builds a flowi4 key from the packet's IP header, mark and device and
 * passes it to ipmr_fib_lookup().  For an output route the device index is
 * used as oif and LOOPBACK_IFINDEX as iif; otherwise oif is 0 and the device
 * index is the iif.  Returns the table, or ERR_PTR(err) when the lookup
 * reports an error.
 */
static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph = ip_hdr(skb);
	struct flowi4 fl4 = {
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
		.flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0),
		.flowi4_iif = (rt_is_output_route(rt) ? LOOPBACK_IFINDEX : skb->dev->ifindex),
		.flowi4_mark = skb->mark,
	};
	struct mr_table *mrt;
	int err;

	err = ipmr_fib_lookup(net, &fl4, &mrt);
	if (err)
		return ERR_PTR(err);
	return mrt;
}

/* Multicast packets for forwarding arrive here
 * Called with rcu_read_lock();
 */
int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	struct net *net = dev_net(skb->dev);
	/* non-zero when the route is flagged RTCF_LOCAL: the packet must
	 * also be delivered locally
	 */
	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	struct mr_table *mrt;
	struct net_device *dev;

	/* skb->dev passed in is the loX master dev for vrfs.
	 * As there are no vifs associated with loopback devices,
	 * get the proper interface that does have a vif associated with it.
*/
	dev = skb->dev;
	if (netif_is_l3_master(skb->dev)) {
		dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
		if (!dev) {
			kfree_skb(skb);
			return -ENODEV;
		}
	}

	/* Packet is looped back after forward, it should not be
	 * forwarded second time, but still can be delivered locally.
	 */
	if (IPCB(skb)->flags & IPSKB_FORWARDED)
		goto dont_forward;

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt)) {
		kfree_skb(skb);
		return PTR_ERR(mrt);
	}
	if (!local) {
		if (IPCB(skb)->opt.router_alert) {
			/* a non-zero return means the RA chain took the packet */
			if (ip_call_ra_chain(skb))
				return 0;
		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
			/* IGMPv1 (and broken IGMPv2 implementations sort of
			 * Cisco IOS <= 11.2(8)) do not put router alert
			 * option to IGMP packets destined to routable
			 * groups. It is very bad, because it means
			 * that we can forward NO IGMP messages.
			 */
			struct sock *mroute_sk;

			mroute_sk = rcu_dereference(mrt->mroute_sk);
			if (mroute_sk) {
				nf_reset_ct(skb);
				raw_rcv(mroute_sk, skb);
				return 0;
			}
		}
	}

	/* already under rcu_read_lock() */
	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
	if (!cache) {
		/* no exact entry: try ipmr_cache_find_any() for the group on the arrival vif */
		int vif = ipmr_find_vif(mrt, dev);

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, vif);
	}

	/* No usable cache entry */
	if (!cache) {
		int vif;

		if (local) {
			/* clone first: the original goes to local delivery,
			 * the clone is what gets queued as unresolved below
			 */
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

			ip_local_deliver(skb);
			if (!skb2)
				return -ENOBUFS;
			skb = skb2;
		}

		vif = ipmr_find_vif(mrt, dev);
		if (vif >= 0)
			return ipmr_cache_unresolved(mrt, vif, skb, dev);
		kfree_skb(skb);
		return -ENODEV;
	}

	ip_mr_forward(net, mrt, dev, skb, cache, local);

	/* with local set ip_mr_forward() only sent clones, skb is still ours */
	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/* Handle IGMP messages of PIMv1 */
/* Always returns 0.  The skb is freed here when validation fails or when
 * __pim_rcv() returns non-zero.
 */
int pim_rcv_v1(struct sk_buff *skb)
{
	struct igmphdr *pim;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;

	/* need the PIMv1 header plus the encapsulated IP header in linear data */
	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = igmp_hdr(skb);

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt))
		goto
drop;
	/* only PIMv1 register messages are accepted, and only with PIM enabled on the table */
	if (!mrt->mroute_do_pim ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
		/* "drop" sits inside this block so that the early exits above
		 * share the kfree_skb() with a failing __pim_rcv()
		 */
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
/* Receive handler installed through pim_protocol.
 *
 * Always returns 0.  The skb is freed here when validation fails or when
 * __pim_rcv() returns non-zero.
 */
static int pim_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	/* Drop unless this is a register message that is not a null
	 * register; the checksum is accepted if it verifies either over the
	 * PIM header alone or over the whole skb.
	 */
	if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt))
		goto drop;
	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

/* Fill multicast route information for (saddr, daddr) into rtm/skb.
 *
 * The lookup is done in the RT_TABLE_DEFAULT table under rcu_read_lock().
 * If an entry is found (exact match, or ipmr_cache_find_any() on the vif of
 * skb->dev) the result of mr_fill_mroute() is returned.  If none is found, a
 * copy of skb with a placeholder IP header is queued through
 * ipmr_cache_unresolved() and its result is returned.
 *
 * Returns -ENOENT when the default table does not exist, -ENODEV when
 * skb->dev is NULL or has no vif, -ENOMEM when the copy cannot be allocated.
 */
int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid)
{
	struct mfc_cache *cache;
	struct mr_table *mrt;
	int err;

	rcu_read_lock();
	mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt) {
		rcu_read_unlock();
		return -ENOENT;
	}

	cache = ipmr_cache_find(mrt, saddr, daddr);
	if (!cache && skb->dev) {
		int vif = ipmr_find_vif(mrt, skb->dev);

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, daddr, vif);
	}
	if (!cache) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif = -1;

		dev = skb->dev;
		if (dev)
			vif = ipmr_find_vif(mrt, dev);
		if (vif < 0) {
			rcu_read_unlock();
			return -ENODEV;
		}

		skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
		if (!skb2) {
			rcu_read_unlock();
			return -ENOMEM;
		}

		/* Prepend a minimal IP header carrying only the addresses;
		 * version is set to 0, which no real IPv4 header has.
		 */
		NETLINK_CB(skb2).portid = portid;
		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = saddr;
		iph->daddr = daddr;
		iph->version = 0;
		err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
		rcu_read_unlock();
		return err;
	}

	err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
	rcu_read_unlock();
	return err;
}

/* Append one netlink route message describing cache entry c to skb.
 * Returns 0 on success, -EMSGSIZE when the message could not be built.
 */
static int
ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc_cache *c, int cmd, int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	int err;

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* fixed header: full /32 source and destination lengths */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = RTNL_FAMILY_IPMR;
	rtm->rtm_dst_len = 32;
	rtm->rtm_src_len = 32;
	rtm->rtm_tos = 0;
	rtm->rtm_table = mrt->id;
	/* the table id is also sent as an RTA_TABLE attribute */
	if (nla_put_u32(skb, RTA_TABLE, mrt->id))
		goto nla_put_failure;
	rtm->rtm_type = RTN_MULTICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* protocol is derived from the entry's MFC_STATIC flag */
	if (c->_c.mfc_flags & MFC_STATIC)
		rtm->rtm_protocol = RTPROT_STATIC;
	else
		rtm->rtm_protocol = RTPROT_MROUTED;
	rtm->rtm_flags = 0;

	if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
		goto nla_put_failure;
	err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
	/* do not break the dump if cache is unresolved */
	if (err < 0 && err != -ENOENT)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* drop the partially built message from skb */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/* Adapter taking the generic struct mr_mfc; it casts c to struct mfc_cache
 * and forwards to ipmr_fill_mroute().  Passed as the fill callback to the
 * dump helpers.
 */
static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags)
{
	return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c, cmd, flags);
}

/* Size of the netlink message needed for one route.
 * For an unresolved entry only the header, table, source and destination
 * are counted; otherwise the input interface, maxvif next hops and the
 * statistics block are added.
 */
static size_t mroute_msgsize(bool unresolved, int maxvif)
{
	size_t len =
		NLMSG_ALIGN(sizeof(struct rtmsg))
		+ nla_total_size(4)	/* RTA_TABLE */
		+ nla_total_size(4)	/* RTA_SRC */
		+ nla_total_size(4)	/* RTA_DST */
		;

	if (!unresolved)
		len = len
		      + nla_total_size(4)	/* RTA_IIF */
		      + nla_total_size(0)	/* RTA_MULTIPATH */
		      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
						/* RTA_MFC_STATS */
		      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
		;

	return len;
}

/* Notify RTNLGRP_IPV4_MROUTE listeners about cache entry mfc with message
 * type cmd.  On failure the error is recorded with rtnl_set_sk_err().
 */
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;
	int err = -ENOBUFS;	/* reported if the allocation below fails */

	/* an entry whose parent is >= MAXVIFS is sized as unresolved */
	skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, mrt->maxvif), GFP_ATOMIC);
	if (!skb)
		goto errout;

	err =
ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
	if (err < 0)
		goto errout;

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
	return;

errout:
	/* skb is NULL here when nlmsg_new() failed */
	kfree_skb(skb);
	rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}

/* Size of the netlink message needed for a cache report that carries
 * payloadlen bytes of packet data.
 */
static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
{
	size_t len =
		NLMSG_ALIGN(sizeof(struct rtgenmsg))
		+ nla_total_size(1)	/* IPMRA_CREPORT_MSGTYPE */
		+ nla_total_size(4)	/* IPMRA_CREPORT_VIF_ID */
		+ nla_total_size(4)	/* IPMRA_CREPORT_SRC_ADDR */
		+ nla_total_size(4)	/* IPMRA_CREPORT_DST_ADDR */
		+ nla_total_size(4)	/* IPMRA_CREPORT_TABLE */
					/* IPMRA_CREPORT_PKT */
		+ nla_total_size(payloadlen)
		;

	return len;
}

/* Send an RTM_NEWCACHEREPORT message to RTNLGRP_IPV4_MROUTE_R listeners.
 *
 * pkt is read as a struct igmpmsg at its network header; the bytes after
 * the first sizeof(struct igmpmsg) bytes of pkt are copied into
 * IPMRA_CREPORT_PKT.  Every failure is reported as -ENOBUFS through
 * rtnl_set_sk_err().
 */
static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
	struct net *net = read_pnet(&mrt->net);
	struct nlmsghdr *nlh;
	struct rtgenmsg *rtgenm;
	struct igmpmsg *msg;
	struct sk_buff *skb;
	struct nlattr *nla;
	int payloadlen;

	payloadlen = pkt->len - sizeof(struct igmpmsg);
	msg = (struct igmpmsg *)skb_network_header(pkt);

	skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
	if (!skb)
		goto errout;

	nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0);
	if (!nlh)
		goto errout;
	rtgenm = nlmsg_data(nlh);
	rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
	/* the vif id is assembled from its low byte (im_vif) and high byte (im_vif_hi) */
	if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
	    nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
	    nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) ||
	    nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, msg->im_dst.s_addr) ||
	    nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
		goto nla_put_failure;

	/* reserve the attribute first, then copy the packet bytes straight into it */
	nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
	if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg), nla_data(nla), payloadlen))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
	return;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
errout:
	kfree_skb(skb);
	rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
}

/* Validate the header of a multicast route get request and parse its
 * attributes into tb.  Returns 0 on success or a negative errno, with a
 * message set in extack for the checks done here.
 */
static int
ipmr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
		return -EINVAL;
	}

	/* without strict checking only the attribute parse is performed */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack);

	/* strict mode: prefix lengths must be 0 or 32 and every other header field 0 */
	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack);
	if (err)
		return err;

	/* an address attribute requires the matching non-zero length in the header */
	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	/* only RTA_SRC, RTA_DST and RTA_TABLE may be present */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_TABLE:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request");
			return -EINVAL;
		}
	}

	return 0;
}

/* doit handler registered for RTM_GETROUTE in ipmr_rtnl_msg_handlers.
 *
 * Looks up the (source, group) entry in the requested table
 * (RT_TABLE_DEFAULT when RTA_TABLE is absent or 0) and unicasts an
 * RTM_NEWROUTE message describing it back to the requester.
 * Returns 0 or a negative errno: -ENOENT when the table or the entry does
 * not exist, -ENOBUFS when the reply cannot be allocated.
 */
static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX + 1];
	struct sk_buff *skb = NULL;	/* stays NULL until the reply is allocated */
	struct mfc_cache *cache;
	struct mr_table *mrt;
	__be32 src, grp;
	u32 tableid;
	int err;

	err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	/* missing attributes default to 0 */
	src = nla_get_in_addr_default(tb[RTA_SRC], 0);
	grp = nla_get_in_addr_default(tb[RTA_DST], 0);
	tableid = nla_get_u32_default(tb[RTA_TABLE], 0);

	mrt = __ipmr_get_table(net, tableid ?
tableid : RT_TABLE_DEFAULT);
	if (!mrt) {
		err = -ENOENT;
		goto errout_free;
	}

	/* entries are added/deleted only under RTNL */
	rcu_read_lock();
	cache = ipmr_cache_find(mrt, src, grp);
	rcu_read_unlock();
	if (!cache) {
		err = -ENOENT;
		goto errout_free;
	}

	skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout_free;
	}

	err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
	if (err < 0)
		goto errout_free;

	/* no kfree_skb() after this call, whatever it returns */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout:
	return err;

errout_free:
	/* skb may still be NULL here */
	kfree_skb(skb);
	goto errout;
}

/* dumpit handler registered for RTM_GETROUTE in ipmr_rtnl_msg_handlers.
 *
 * With a table id in the filter only that table is dumped through
 * mr_table_dump(); otherwise all tables are dumped through
 * mr_rtm_dumproute().
 */
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct fib_dump_filter filter = {
		.rtnl_held = true,
	};
	int err;

	/* the filter is only filled from the request under strict checking */
	if (cb->strict_check) {
		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, &filter, cb);
		if (err < 0)
			return err;
	}

	if (filter.table_id) {
		struct mr_table *mrt;

		mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id);
		if (!mrt) {
			/* A missing table is an error only when the request
			 * was made for RTNL_FAMILY_IPMR itself; for any other
			 * family the dump just ends with what is in skb.
			 */
			if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR)
				return skb->len;

			NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
			return -ENOENT;
		}
		err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, &mfc_unres_lock, &filter);
		/* err is returned only if nothing was written to skb */
		return skb->len ?
: err;
	}

	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, _ipmr_fill_mroute, &mfc_unres_lock, &filter);
}

/* Attribute policy used by rtm_to_ipmr_mfcc() to validate route add/delete requests */
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
	[RTA_SRC]	= { .type = NLA_U32 },
	[RTA_DST]	= { .type = NLA_U32 },
	[RTA_IIF]	= { .type = NLA_U32 },
	[RTA_TABLE]	= { .type = NLA_U32 },
	[RTA_MULTIPATH]	= { .len = sizeof(struct rtnexthop) },
};

/* Only RTPROT_STATIC and RTPROT_MROUTED are accepted as rtm_protocol */
static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
{
	switch (rtm_protocol) {
	case RTPROT_STATIC:
	case RTPROT_MROUTED:
		return true;
	}
	return false;
}

/* Copy the rtnh_hops value of each next hop in nla into mfcc->mfcc_ttls[],
 * in order, starting at index 0 and storing at most MAXVIFS entries.
 * Returns the number of entries stored, or -EINVAL when bytes are left over
 * after the walk.
 *
 * NOTE(review): when the MAXVIFS-th entry is stored the loop breaks before
 * rtnh_next() has subtracted that entry from "remaining", so "remaining" is
 * still positive and -EINVAL is returned even if the attribute holds exactly
 * MAXVIFS entries - confirm whether that limit is intended.
 */
static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
{
	struct rtnexthop *rtnh = nla_data(nla);
	int remaining = nla_len(nla), vifi = 0;

	while (rtnh_ok(rtnh, remaining)) {
		mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops;
		if (++vifi == MAXVIFS)
			break;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return remaining > 0 ? -EINVAL : vifi;
}

/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
/* Translates a route add/delete request into *mfcc.  On success *mrtret is
 * the target table and *mrtsock is 1 when rtm_protocol is RTPROT_MROUTED,
 * 0 otherwise.
 */
static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct mfcctl *mfcc, int *mrtsock, struct mr_table **mrtret, struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;	/* device named by RTA_IIF, if any */
	u32 tblid = RT_TABLE_DEFAULT;	/* used unless RTA_TABLE is given */
	struct mr_table *mrt;
	struct nlattr *attr;
	struct rtmsg *rtm;
	int ret, rem;

	ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack);
	if (ret < 0)
		goto out;
	rtm = nlmsg_data(nlh);

	/* header sanity: IPMR family, /32 destination, multicast type,
	 * universe scope and an accepted protocol
	 */
	ret = -EINVAL;
	if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
	    rtm->rtm_type != RTN_MULTICAST ||
	    rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
	    !ipmr_rtm_validate_proto(rtm->rtm_protocol))
		goto out;

	memset(mfcc, 0, sizeof(*mfcc));
	mfcc->mfcc_parent = -1;
	ret = 0;
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
		switch (nla_type(attr)) {
		case RTA_SRC:
			mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
			break;
		case RTA_DST:
			mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
			break;
		case RTA_IIF:
			dev = __dev_get_by_index(net, nla_get_u32(attr));
			if (!dev) {
				ret = -ENODEV;
				goto out;
			}
			break;
		case RTA_MULTIPATH:
if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
				ret = -EINVAL;
				goto out;
			}
			break;
		case RTA_PREFSRC:
			/* presence of this attribute selects the proxy variant */
			ret = 1;
			break;
		case RTA_TABLE:
			tblid = nla_get_u32(attr);
			break;
		}
	}
	mrt = __ipmr_get_table(net, tblid);
	if (!mrt) {
		ret = -ENOENT;
		goto out;
	}
	*mrtret = mrt;
	*mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
	/* the vif lookup needs the table, so it is done after the attribute
	 * loop; ipmr_find_vif() yields -1 when dev has no vif
	 */
	if (dev)
		mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);

out:
	return ret;
}

/* takes care of both newroute and delroute */
static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	int ret, mrtsock, parent;
	struct mr_table *tbl;
	struct mfcctl mfcc;

	mrtsock = 0;
	tbl = NULL;
	ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
	if (ret < 0)
		return ret;

	/* ret is 1 for the proxy variant: only then is a parent passed on */
	parent = ret ? mfcc.mfcc_parent : -1;
	if (nlh->nlmsg_type == RTM_NEWROUTE)
		return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
	else
		return ipmr_mfc_delete(tbl, &mfcc, parent);
}

/* Put the per-table attributes of mrt into skb.
 * Returns false as soon as one attribute cannot be added, true otherwise.
 */
static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
{
	u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);

	if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
	    nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
	    nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, mrt->mroute_reg_vif_num) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, mrt->mroute_do_assert) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, mrt->mroute_do_wrvifwhole))
		return false;

	return true;
}

/* Put one IPMRA_VIF nest describing vif number vifid of mrt into skb.
 * Returns true when the nest was added or the slot has no device, false
 * when skb has no room; in that case the partial nest is cancelled.
 */
static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
	struct net_device *vif_dev;
	struct nlattr *vif_nest;
	struct vif_device *vif;

	vif = &mrt->vif_table[vifid];
	vif_dev = rtnl_dereference(vif->dev);
	/* if the VIF doesn't exist just continue */
	if (!vif_dev)
		return true;

	vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
	if (!vif_nest)
		return false;

	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) ||
	    nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, IPMRA_VIFA_PAD) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, IPMRA_VIFA_PAD) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, IPMRA_VIFA_PAD) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, IPMRA_VIFA_PAD) ||
	    nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
	    nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) {
		/* roll back everything added since the nest was opened */
		nla_nest_cancel(skb, vif_nest);
		return false;
	}
	nla_nest_end(skb, vif_nest);

	return true;
}

/* Strict validation of a link dump request: the message must hold a
 * complete struct ifinfomsg, nothing after it, and every header field
 * checked below must be zero (ifi_family is not checked).
 * Returns 0 or -EINVAL with a message set in extack.
 */
static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, struct netlink_ext_ack *extack)
{
	struct ifinfomsg *ifm;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
		return -EINVAL;
	}

	if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
		NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
		return -EINVAL;
	}

	ifm = nlmsg_data(nlh);
	if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
	    ifm->ifi_change || ifm->ifi_index) {
		NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
		return -EINVAL;
	}

	return 0;
}

/* dumpit handler registered for RTM_GETLINK in ipmr_rtnl_msg_handlers.
 *
 * Emits one RTM_NEWLINK message per table, containing the table attributes
 * and a nest of its vifs.  The dump can be resumed: cb->args[0] holds the
 * table position and cb->args[1] the vif position reached so far.
 */
static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct nlmsghdr *nlh = NULL;
	unsigned int t = 0, s_t;	/* current table index / index to resume at */
	unsigned int e = 0, s_e;	/* current vif index / index to resume at */
	struct mr_table *mrt;

	if (cb->strict_check) {
		int err = ipmr_valid_dumplink(cb->nlh, cb->extack);

		if (err < 0)
			return err;
	}

	s_t = cb->args[0];
	s_e = cb->args[1];

	ipmr_for_each_table(mrt, net) {
		struct nlattr *vifs, *af;
		struct ifinfomsg *hdr;
		u32 i;

		/* tables before the resume point are skipped */
		if (t < s_t)
			goto skip_table;
		nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWLINK, sizeof(*hdr), NLM_F_MULTI);
		if (!nlh)
			break;

		hdr = nlmsg_data(nlh);
		memset(hdr, 0, sizeof(*hdr));
		hdr->ifi_family = RTNL_FAMILY_IPMR;

		af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
		if (!af) {
			nlmsg_cancel(skb, nlh);
			goto out;
		}

		if (!ipmr_fill_table(mrt,
skb)) {
			nlmsg_cancel(skb, nlh);
			goto out;
		}

		vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS);
		if (!vifs) {
			/* the table attributes already fit: close the message
			 * and keep it instead of cancelling
			 */
			nla_nest_end(skb, af);
			nlmsg_end(skb, nlh);
			goto out;
		}
		for (i = 0; i < mrt->maxvif; i++) {
			/* vifs before the resume point are skipped */
			if (e < s_e)
				goto skip_entry;
			if (!ipmr_fill_vif(mrt, i, skb)) {
				/* skb is full: close what was written; e and t
				 * are saved at "out" for the next call
				 */
				nla_nest_end(skb, vifs);
				nla_nest_end(skb, af);
				nlmsg_end(skb, nlh);
				goto out;
			}
skip_entry:
			e++;
		}
		/* table complete: the next table starts from its first vif */
		s_e = 0;
		e = 0;
		nla_nest_end(skb, vifs);
		nla_nest_end(skb, af);
		nlmsg_end(skb, nlh);
skip_table:
		t++;
	}

out:
	cb->args[1] = e;
	cb->args[0] = t;

	return skb->len;
}

#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing :
 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
 */

/* seq_file ->start for ip_mr_vif: takes rcu_read_lock(), stores the default
 * table in the iterator and delegates to mr_vif_seq_start().
 *
 * NOTE(review): on the -ENOENT path the RCU read lock is dropped here; if
 * the seq_file core also calls ->stop() after a failed ->start(),
 * ipmr_vif_seq_stop() would unlock a second time - confirm.
 */
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct mr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	rcu_read_lock();
	mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt) {
		rcu_read_unlock();
		return ERR_PTR(-ENOENT);
	}

	iter->mrt = mrt;

	return mr_vif_seq_start(seq, pos);
}

/* seq_file ->stop for ip_mr_vif: releases the lock taken in ipmr_vif_seq_start() */
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

/* seq_file ->show for ip_mr_vif: prints the column header for
 * SEQ_START_TOKEN, otherwise one line for the vif passed in v.
 */
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	struct mr_vif_iter *iter = seq->private;
	struct mr_table *mrt = iter->mrt;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
	} else {
		const struct vif_device *vif = v;
		const struct net_device *vif_dev;
		const char *name;

		vif_dev = vif_dev_read(vif);
		name = vif_dev ?
vif_dev->name : "none";
		/* the first column is the vif's index within mrt->vif_table */
		seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags, vif->local, vif->remote);
	}
	return 0;
}

/* seq_file operations behind the "ip_mr_vif" proc entry */
static const struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = mr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

/* seq_file ->start for ip_mr_cache: looks up the default table and
 * delegates to mr_mfc_seq_start(); returns ERR_PTR(-ENOENT) when the table
 * does not exist.
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
}

/* seq_file ->show for ip_mr_cache: prints the column header for
 * SEQ_START_TOKEN, otherwise one line for the cache entry passed in v.
 */
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "Group Origin Iif Pkts Bytes Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct mr_mfc_iter *it = seq->private;
		const struct mr_table *mrt = it->mrt;

		seq_printf(seq, "%08X %08X %-3hd", (__force u32) mfc->mfc_mcastgrp, (__force u32) mfc->mfc_origin, mfc->_c.mfc_parent);

		/* entries not taken from the unresolved queue carry
		 * counters and a per-vif ttl list
		 */
		if (it->cache != &mrt->mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu", atomic_long_read(&mfc->_c.mfc_un.res.pkt), atomic_long_read(&mfc->_c.mfc_un.res.bytes), atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
			/* list "vif:ttl" for every existing vif whose ttl is below 255 */
			for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(mrt, n) &&
				    mfc->_c.mfc_un.res.ttls[n] < 255)
					seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

/* seq_file operations behind the "ip_mr_cache" proc entry */
static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = mr_mfc_seq_next,
	.stop  = mr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
/* Protocol handler registered for IPPROTO_PIM in ip_mr_init() */
static const struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
};
#endif

/* fib_seq_read callback: sum of the namespace's ipmr_seq counter and the
 * value returned by ipmr_rules_seq_read()
 */
static unsigned int ipmr_seq_read(const struct net *net)
{
	return
READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net);
}

/* fib_dump callback: delegates to mr_dump() with the IPMR family, rules
 * dumper and table iterator
 */
static int ipmr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack)
{
	return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, ipmr_mr_table_iter, extack);
}

/* Template passed to fib_notifier_ops_register() for every namespace */
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
	.family		= RTNL_FAMILY_IPMR,
	.fib_seq_read	= ipmr_seq_read,
	.fib_dump	= ipmr_dump,
	.owner		= THIS_MODULE,
};

/* Reset the namespace's ipmr_seq counter and register its FIB notifier ops.
 * Returns 0, or the error encoded in the pointer returned by
 * fib_notifier_ops_register().
 */
static int __net_init ipmr_notifier_init(struct net *net)
{
	struct fib_notifier_ops *ops;

	net->ipv4.ipmr_seq = 0;

	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);
	net->ipv4.ipmr_notifier_ops = ops;

	return 0;
}

/* Unregister the namespace's FIB notifier ops and clear the stored pointer */
static void __net_exit ipmr_notifier_exit(struct net *net)
{
	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
	net->ipv4.ipmr_notifier_ops = NULL;
}

/* Setup for IP multicast routing */
/* Per-namespace init: notifier ops, then rules/tables, then (with
 * CONFIG_PROC_FS) the two proc entries.  On failure the steps already done
 * are undone in reverse order through the labels at the end.
 */
static int __net_init ipmr_net_init(struct net *net)
{
	int err;

	err = ipmr_notifier_init(net);
	if (err)
		goto ipmr_notifier_fail;

	err = ipmr_rules_init(net);
	if (err < 0)
		goto ipmr_rules_fail;

#ifdef CONFIG_PROC_FS
	/* proc_create_net() returns no error code, so -ENOMEM is assumed */
	err = -ENOMEM;
	if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops, sizeof(struct mr_vif_iter)))
		goto proc_vif_fail;
	if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter)))
		goto proc_cache_fail;
#endif
	return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
	remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
	/* ipmr_rules_exit() is called with the RTNL lock held */
	rtnl_lock();
	ipmr_rules_exit(net);
	rtnl_unlock();
#endif
ipmr_rules_fail:
	ipmr_notifier_exit(net);
ipmr_notifier_fail:
	return err;
}

/* Per-namespace exit: removes the proc entries and the notifier ops.
 * The rules/tables are torn down separately in ipmr_net_exit_batch().
 */
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_mr_cache", net->proc_net);
	remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
	ipmr_notifier_exit(net);
}

/* Batched exit: calls ipmr_rules_exit() for every namespace on net_list
 * under a single rtnl_lock()/rtnl_unlock() pair.
 */
static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list)
		ipmr_rules_exit(net);
rtnl_unlock();
}

/* Per-network-namespace init/exit hooks registered by ip_mr_init() */
static struct pernet_operations ipmr_net_ops = {
	.init = ipmr_net_init,
	.exit = ipmr_net_exit,
	.exit_batch = ipmr_net_exit_batch,
};

/* rtnetlink handlers for RTNL_FAMILY_IPMR: link dump, route add/delete
 * (both served by ipmr_rtm_route()) and route get/dump
 */
static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = {
	{.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK,
	 .dumpit = ipmr_rtm_dumplink},
	{.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE,
	 .doit = ipmr_rtm_route},
	{.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE,
	 .doit = ipmr_rtm_route},
	{.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE,
	 .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute},
};

/* Subsystem initialisation.
 *
 * Creates the mfc_cache slab cache, then registers the pernet operations,
 * the netdevice notifier, the PIM protocol handler (CONFIG_IP_PIMSM_V2
 * only) and finally the rtnetlink handlers.  Returns 0 on success; on
 * failure the registrations already made are undone in reverse order and
 * the error is returned (-EAGAIN when the PIM protocol cannot be added).
 */
int __init ip_mr_init(void)
{
	int err;

	mrt_cachep = KMEM_CACHE(mfc_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);

	err = register_pernet_subsys(&ipmr_net_ops);
	if (err)
		goto reg_pernet_fail;

	err = register_netdevice_notifier(&ip_mr_notifier);
	if (err)
		goto reg_notif_fail;
#ifdef CONFIG_IP_PIMSM_V2
	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
		pr_err("%s: can't add PIM protocol\n", __func__);
		err = -EAGAIN;
		goto add_proto_fail;
	}
#endif
	/* the return value, if any, is not checked */
	rtnl_register_many(ipmr_rtnl_msg_handlers);

	return 0;

#ifdef CONFIG_IP_PIMSM_V2
add_proto_fail:
	unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
	unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);
	return err;
}
58 59 537 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TRACE_EVENT_H #define _LINUX_TRACE_EVENT_H #include <linux/ring_buffer.h> #include <linux/trace_seq.h> #include <linux/percpu.h> 
#include <linux/hardirq.h>
#include <linux/perf_event.h>
#include <linux/tracepoint.h>

struct trace_array;
struct array_buffer;
struct tracer;
struct dentry;
struct bpf_prog;
union bpf_attr;

/* Used for event string fields when they are NULL */
#define EVENT_NULL_STR		"(null)"

const char *trace_print_flags_seq(struct trace_seq *p, const char *delim,
				  unsigned long flags,
				  const struct trace_print_flags *flag_array);

const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val,
				    const struct trace_print_flags *symbol_array);

/* The u64 variants are only declared when long is 32 bits wide. */
#if BITS_PER_LONG == 32
const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
		      unsigned long long flags,
		      const struct trace_print_flags_u64 *flag_array);

const char *trace_print_symbols_seq_u64(struct trace_seq *p,
					unsigned long long val,
					const struct trace_print_flags_u64
								 *symbol_array);
#endif

const char *trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
				    unsigned int bitmask_size);

const char *trace_print_hex_seq(struct trace_seq *p,
				const unsigned char *buf, int len,
				bool concatenate);

const char *trace_print_array_seq(struct trace_seq *p,
				   const void *buf, int count,
				   size_t el_size);

const char *
trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str,
			 int prefix_type, int rowsize, int groupsize,
			 const void *buf, size_t len, bool ascii);

struct trace_iterator;
struct trace_event;

int trace_raw_output_prep(struct trace_iterator *iter,
			  struct trace_event *event);
extern __printf(2, 3)
void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...);

/*
 * Used to find the offset and length of dynamic fields in trace events.
 * The member order depends on CONFIG_CPU_BIG_ENDIAN.
 */
struct trace_dynamic_info {
#ifdef CONFIG_CPU_BIG_ENDIAN
	u16	len;
	u16	offset;
#else
	u16	offset;
	u16	len;
#endif
} __packed;

/*
 * The trace entry - the most basic unit of tracing. This is what
 * is printed in the end as a single line in the trace output, such as:
 *
 *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
 */
struct trace_entry {
	unsigned short		type;
	unsigned char		flags;
	unsigned char		preempt_count;
	int			pid;
};

/* Largest value representable in trace_entry::type (all bits set). */
#define TRACE_EVENT_TYPE_MAX						\
	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)

/*
 * Trace iterator - used by printout routines who present trace
 * results to users and which routines might sleep, etc:
 */
struct trace_iterator {
	struct trace_array	*tr;
	struct tracer		*trace;
	struct array_buffer	*array_buffer;
	void			*private;
	int			cpu_file;
	struct mutex		mutex;
	struct ring_buffer_iter	**buffer_iter;
	unsigned long		iter_flags;
	void			*temp;	/* temp holder */
	unsigned int		temp_size;
	char			*fmt;	/* modified format holder */
	unsigned int		fmt_size;
	atomic_t		wait_index;

	/* trace_seq for __print_flags() and __print_symbolic() etc. */
	struct trace_seq	tmp_seq;

	cpumask_var_t		started;

	/* Set when the file is closed to prevent new waiters */
	bool			closed;

	/* it's true when current open file is snapshot */
	bool			snapshot;

	/* The below is zeroed out in pipe_read */
	struct trace_seq	seq;
	struct trace_entry	*ent;
	unsigned long		lost_events;
	int			leftover;
	int			ent_size;
	int			cpu;
	u64			ts;

	loff_t			pos;
	long			idx;

	/* All new field here will be zeroed out in pipe_read */
};

enum trace_iter_flags {
	TRACE_FILE_LAT_FMT	= 1,
	TRACE_FILE_ANNOTATE	= 2,
	TRACE_FILE_TIME_IN_NS	= 4,
};


typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
				      int flags, struct trace_event *event);

/* One print callback per output format (trace, raw, hex, binary). */
struct trace_event_functions {
	trace_print_func	trace;
	trace_print_func	raw;
	trace_print_func	hex;
	trace_print_func	binary;
};

struct trace_event {
	struct hlist_node		node;
	int				type;
	struct trace_event_functions	*funcs;
};

extern int register_trace_event(struct trace_event *event);
extern int unregister_trace_event(struct trace_event *event);

/* Return values for print_line callback */
enum print_line_t {
	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
	TRACE_TYPE_HANDLED	= 1,
	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
};

enum print_line_t trace_handle_return(struct trace_seq *s);

/*
 * Fill in the common trace_entry header.
 *
 * The low byte of @trace_ctx is stored as the preempt count and
 * @trace_ctx >> 16 is stored in the (8-bit) flags field; the pid is
 * taken from current.
 */
static inline void tracing_generic_entry_update(struct trace_entry *entry,
						unsigned short type,
						unsigned int trace_ctx)
{
	entry->preempt_count		= trace_ctx & 0xff;
	entry->pid			= current->pid;
	entry->type			= type;
	entry->flags			= trace_ctx >> 16;
}

unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status);

enum trace_flag_type {
	TRACE_FLAG_IRQS_OFF		= 0x01,
	TRACE_FLAG_NEED_RESCHED_LAZY	= 0x02,
	TRACE_FLAG_NEED_RESCHED		= 0x04,
	TRACE_FLAG_HARDIRQ		= 0x08,
	TRACE_FLAG_SOFTIRQ		= 0x10,
	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
	TRACE_FLAG_NMI			= 0x40,
	TRACE_FLAG_BH_OFF		= 0x80,
};

/*
 * Build a trace context from already-saved irq flags: passes
 * TRACE_FLAG_IRQS_OFF to tracing_gen_ctx_irq_test() when @irqflags
 * indicates interrupts are disabled, 0 otherwise.
 */
static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags)
{
	unsigned int irq_status = irqs_disabled_flags(irqflags) ?
		TRACE_FLAG_IRQS_OFF : 0;
	return tracing_gen_ctx_irq_test(irq_status);
}

/* Same as tracing_gen_ctx_flags(), using the irq flags read right here. */
static inline unsigned int tracing_gen_ctx(void)
{
	unsigned long irqflags;

	local_save_flags(irqflags);
	return tracing_gen_ctx_flags(irqflags);
}

static inline unsigned int tracing_gen_ctx_dec(void)
{
	unsigned int trace_ctx;

	trace_ctx = tracing_gen_ctx();
	/*
	 * Subtract one from the preemption counter if preemption is enabled,
	 * see trace_event_buffer_reserve() for details.
	 */
	if (IS_ENABLED(CONFIG_PREEMPTION))
		trace_ctx--;
	return trace_ctx;
}

struct trace_event_file;

struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer,
				struct trace_event_file *trace_file,
				int type, unsigned long len,
				unsigned int trace_ctx);

#define TRACE_RECORD_CMDLINE	BIT(0)
#define TRACE_RECORD_TGID	BIT(1)

void tracing_record_taskinfo(struct task_struct *task, int flags);
void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
					  struct task_struct *next, int flags);

void tracing_record_cmdline(struct task_struct *task);
void tracing_record_tgid(struct task_struct *task);

int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
	 __printf(3, 4);

struct event_filter;

/* The PERF_* values only exist when CONFIG_PERF_EVENTS is set. */
enum trace_reg {
	TRACE_REG_REGISTER,
	TRACE_REG_UNREGISTER,
#ifdef CONFIG_PERF_EVENTS
	TRACE_REG_PERF_REGISTER,
	TRACE_REG_PERF_UNREGISTER,
	TRACE_REG_PERF_OPEN,
	TRACE_REG_PERF_CLOSE,
	/*
	 * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
	 * custom action was taken and the default action is not to be
	 * performed.
	 */
	TRACE_REG_PERF_ADD,
	TRACE_REG_PERF_DEL,
#endif
};

struct trace_event_call;

#define TRACE_FUNCTION_TYPE ((const char *)~0UL)

/*
 * The anonymous union overlays the per-field description with a
 * define_fields() callback.
 * NOTE(review): presumably type == TRACE_FUNCTION_TYPE selects the
 * callback member - confirm against the users of this struct.
 */
struct trace_event_fields {
	const char *type;
	union {
		struct {
			const char *name;
			const int  size;
			const int  align;
			const unsigned int is_signed:1;
			unsigned int needs_test:1;
			const int  filter_type;
			const int  len;
		};
		int (*define_fields)(struct trace_event_call *);
	};
};

struct trace_event_class {
	const char		*system;
	void			*probe;
#ifdef CONFIG_PERF_EVENTS
	void			*perf_probe;
#endif
	int			(*reg)(struct trace_event_call *event,
				       enum trace_reg type, void *data);
	struct trace_event_fields *fields_array;
	struct list_head	*(*get_fields)(struct trace_event_call *);
	struct list_head	fields;
	int			(*raw_init)(struct trace_event_call *);
};

extern int trace_event_reg(struct trace_event_call *event,
			    enum trace_reg type, void *data);

/* State passed from trace_event_buffer_reserve() to trace_event_buffer_commit(). */
struct trace_event_buffer {
	struct trace_buffer		*buffer;
	struct ring_buffer_event	*event;
	struct trace_event_file		*trace_file;
	void				*entry;
	unsigned int			trace_ctx;
	struct pt_regs			*regs;
};

void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
				  struct trace_event_file *trace_file,
				  unsigned long len);

void trace_event_buffer_commit(struct trace_event_buffer *fbuffer);

/* Bit positions; the matching masks are defined in the enum further below. */
enum {
	TRACE_EVENT_FL_CAP_ANY_BIT,
	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
	TRACE_EVENT_FL_TRACEPOINT_BIT,
	TRACE_EVENT_FL_DYNAMIC_BIT,
	TRACE_EVENT_FL_KPROBE_BIT,
	TRACE_EVENT_FL_UPROBE_BIT,
	TRACE_EVENT_FL_EPROBE_BIT,
	TRACE_EVENT_FL_FPROBE_BIT,
	TRACE_EVENT_FL_CUSTOM_BIT,
	TRACE_EVENT_FL_TEST_STR_BIT,
};

/*
 * Event flags:
 *  CAP_ANY	  - Any user can enable for perf
 *  NO_SET_FILTER - Set when filter has error and is to be ignored
 *  IGNORE_ENABLE - For trace internal events, do not enable with debugfs file
 *  TRACEPOINT    - Event is a tracepoint
 *  DYNAMIC       - Event is a dynamic event (created at run time)
 *  KPROBE        - Event is a kprobe
 *  UPROBE        - Event is a uprobe
 *  EPROBE        - Event is an event probe
 *  FPROBE        - Event is a function probe
 *  CUSTOM        - Event is a custom event (to be attached to an existing tracepoint)
 *                   This is set when the custom event has not been attached
 *                   to a tracepoint yet, then it is cleared when it is.
 *  TEST_STR      - The event has a "%s" that points to a string outside the event
 */
enum {
	TRACE_EVENT_FL_CAP_ANY		= (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
	TRACE_EVENT_FL_DYNAMIC		= (1 << TRACE_EVENT_FL_DYNAMIC_BIT),
	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
	TRACE_EVENT_FL_UPROBE		= (1 << TRACE_EVENT_FL_UPROBE_BIT),
	TRACE_EVENT_FL_EPROBE		= (1 << TRACE_EVENT_FL_EPROBE_BIT),
	TRACE_EVENT_FL_FPROBE		= (1 << TRACE_EVENT_FL_FPROBE_BIT),
	TRACE_EVENT_FL_CUSTOM		= (1 << TRACE_EVENT_FL_CUSTOM_BIT),
	TRACE_EVENT_FL_TEST_STR		= (1 << TRACE_EVENT_FL_TEST_STR_BIT),
};

#define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)

struct trace_event_call {
	struct list_head	list;
	struct trace_event_class *class;
	union {
		const char		*name;
		/* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
		struct tracepoint	*tp;
	};
	struct trace_event	event;
	char			*print_fmt;
	/*
	 * Static events can disappear with modules,
	 * where as dynamic ones need their own ref count.
	 */
	union {
		void				*module;
		atomic_t			refcnt;
	};
	void			*data;

	/* See the TRACE_EVENT_FL_* flags above */
	int			flags; /* static flags of different events */

#ifdef CONFIG_PERF_EVENTS
	int				perf_refcount;
	struct hlist_head __percpu	*perf_events;
	struct bpf_prog_array __rcu	*prog_array;

	int	(*perf_perm)(struct trace_event_call *,
			     struct perf_event *);
#endif
};

#ifdef CONFIG_DYNAMIC_EVENTS
bool trace_event_dyn_try_get_ref(struct trace_event_call *call);
void trace_event_dyn_put_ref(struct trace_event_call *call);
bool trace_event_dyn_busy(struct trace_event_call *call);
#else
static inline bool trace_event_dyn_try_get_ref(struct trace_event_call *call)
{
	/* Without DYNAMIC_EVENTS configured, nothing should be calling this */
	return false;
}
static inline void trace_event_dyn_put_ref(struct trace_event_call *call)
{
}
static inline bool trace_event_dyn_busy(struct trace_event_call *call)
{
	/* Nothing should call this without DYNAMIC_EVENTS configured. */
	return true;
}
#endif

/*
 * Take a reference on @call: dynamic events (TRACE_EVENT_FL_DYNAMIC) go
 * through trace_event_dyn_try_get_ref(), all others take a reference on
 * call->module with try_module_get(). Returns whether it succeeded.
 */
static inline bool trace_event_try_get_ref(struct trace_event_call *call)
{
	if (call->flags & TRACE_EVENT_FL_DYNAMIC)
		return trace_event_dyn_try_get_ref(call);
	else
		return try_module_get(call->module);
}

/* Drop a reference on @call; mirrors trace_event_try_get_ref(). */
static inline void trace_event_put_ref(struct trace_event_call *call)
{
	if (call->flags & TRACE_EVENT_FL_DYNAMIC)
		trace_event_dyn_put_ref(call);
	else
		module_put(call->module);
}

#ifdef CONFIG_PERF_EVENTS
static inline bool bpf_prog_array_valid(struct trace_event_call *call)
{
	/*
	 * This inline function checks whether call->prog_array
	 * is valid or not. The function is called in various places,
	 * outside rcu_read_lock/unlock, as a heuristic to speed up execution.
	 *
	 * If this function returns true, and later call->prog_array
	 * becomes false inside rcu_read_lock/unlock region,
	 * we bail out then. If this function returns false,
	 * there is a risk that we might miss a few events if the checking
	 * were delayed until inside rcu_read_lock/unlock region and
	 * call->prog_array happened to become non-NULL then.
	 *
	 * Here, READ_ONCE() is used instead of rcu_access_pointer().
	 * rcu_access_pointer() requires the actual definition of
	 * "struct bpf_prog_array" while READ_ONCE() only needs
	 * a declaration of the same type.
	 */
	return !!READ_ONCE(call->prog_array);
}
#endif

/*
 * Return the name of @call. TRACE_EVENT_FL_CUSTOM is checked first and
 * uses call->name; otherwise tracepoint events use the tracepoint's name
 * (NULL when call->tp is not set); everything else uses call->name.
 */
static inline const char *
trace_event_name(struct trace_event_call *call)
{
	if (call->flags & TRACE_EVENT_FL_CUSTOM)
		return call->name;
	else if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
		return call->tp ? call->tp->name : NULL;
	else
		return call->name;
}

/*
 * Return the field list of @event_call: the result of the class's
 * get_fields() callback when one is set, otherwise the class's own
 * fields list.
 */
static inline struct list_head *
trace_get_fields(struct trace_event_call *event_call)
{
	if (!event_call->class->get_fields)
		return &event_call->class->fields;
	return event_call->class->get_fields(event_call);
}

struct trace_subsystem_dir;

/* Bit positions; the matching masks are defined in the enum further below. */
enum {
	EVENT_FILE_FL_ENABLED_BIT,
	EVENT_FILE_FL_RECORDED_CMD_BIT,
	EVENT_FILE_FL_RECORDED_TGID_BIT,
	EVENT_FILE_FL_FILTERED_BIT,
	EVENT_FILE_FL_NO_SET_FILTER_BIT,
	EVENT_FILE_FL_SOFT_MODE_BIT,
	EVENT_FILE_FL_SOFT_DISABLED_BIT,
	EVENT_FILE_FL_TRIGGER_MODE_BIT,
	EVENT_FILE_FL_TRIGGER_COND_BIT,
	EVENT_FILE_FL_PID_FILTER_BIT,
	EVENT_FILE_FL_WAS_ENABLED_BIT,
	EVENT_FILE_FL_FREED_BIT,
};

extern struct trace_event_file *trace_get_event_file(const char *instance,
						     const char *system,
						     const char *event);
extern void trace_put_event_file(struct trace_event_file *file);

#define MAX_DYNEVENT_CMD_LEN	(2048)

enum dynevent_type {
	DYNEVENT_TYPE_SYNTH = 1,
	DYNEVENT_TYPE_KPROBE,
	DYNEVENT_TYPE_NONE,
};

struct dynevent_cmd;

typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd);

struct dynevent_cmd {
	struct seq_buf		seq;
	const char		*event_name;
	unsigned int		n_fields;
	enum dynevent_type	type;
	dynevent_create_fn_t	run_command;
	void			*private_data;
};

extern int dynevent_create(struct dynevent_cmd *cmd);

extern int synth_event_delete(const char *name);

extern void synth_event_cmd_init(struct dynevent_cmd *cmd,
				 char *buf, int maxlen);

extern int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd,
				       const char *name,
				       struct module *mod, ...);

/* Appends a NULL sentinel to the variadic argument list. */
#define synth_event_gen_cmd_start(cmd, name, mod, ...)	\
	__synth_event_gen_cmd_start(cmd, name, mod, ## __VA_ARGS__, NULL)

struct synth_field_desc {
	const char *type;
	const char *name;
};

extern int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd,
					   const char *name,
					   struct module *mod,
					   struct synth_field_desc *fields,
					   unsigned int n_fields);
extern int synth_event_create(const char *name,
			      struct synth_field_desc *fields,
			      unsigned int n_fields, struct module *mod);

extern int synth_event_add_field(struct dynevent_cmd *cmd,
				 const char *type,
				 const char *name);
extern int synth_event_add_field_str(struct dynevent_cmd *cmd,
				     const char *type_name);
extern int synth_event_add_fields(struct dynevent_cmd *cmd,
				  struct synth_field_desc *fields,
				  unsigned int n_fields);

#define synth_event_gen_cmd_end(cmd)	\
	dynevent_create(cmd)

struct synth_event;

struct synth_event_trace_state {
	struct trace_event_buffer fbuffer;
	struct synth_trace_event *entry;
	struct trace_buffer *buffer;
	struct synth_event *event;
	unsigned int cur_field;
	unsigned int n_u64;
	bool disabled;
	bool add_next;
	bool add_name;
};

extern int synth_event_trace(struct trace_event_file *file,
			     unsigned int n_vals, ...);
extern int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
				   unsigned int n_vals);
extern int synth_event_trace_start(struct trace_event_file *file,
				   struct synth_event_trace_state *trace_state);
extern int synth_event_add_next_val(u64 val,
				    struct synth_event_trace_state *trace_state);
extern int synth_event_add_val(const char *field_name, u64 val,
			       struct synth_event_trace_state *trace_state);
extern int synth_event_trace_end(struct synth_event_trace_state *trace_state);

extern int kprobe_event_delete(const char *name);

extern void kprobe_event_cmd_init(struct dynevent_cmd *cmd,
				  char *buf, int maxlen);

/*
 * The kprobe and kretprobe start macros differ only in the bool passed as
 * the second argument; both append a NULL sentinel to the argument list.
 */
#define kprobe_event_gen_cmd_start(cmd, name, loc, ...)			\
	__kprobe_event_gen_cmd_start(cmd, false, name, loc, ## __VA_ARGS__, NULL)

#define kretprobe_event_gen_cmd_start(cmd, name, loc, ...)		\
	__kprobe_event_gen_cmd_start(cmd, true, name, loc, ## __VA_ARGS__, NULL)

extern int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd,
					bool kretprobe,
					const char *name,
					const char *loc, ...);

#define kprobe_event_add_fields(cmd, ...)	\
	__kprobe_event_add_fields(cmd, ## __VA_ARGS__, NULL)

#define kprobe_event_add_field(cmd, field)	\
	__kprobe_event_add_fields(cmd, field, NULL)

extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);

#define kprobe_event_gen_cmd_end(cmd)		\
	dynevent_create(cmd)

#define kretprobe_event_gen_cmd_end(cmd)	\
	dynevent_create(cmd)

/*
 * Event file flags:
 *  ENABLED	  - The event is enabled
 *  RECORDED_CMD  - The comms should be recorded at sched_switch
 *  RECORDED_TGID - The tgids should be recorded at sched_switch
 *  FILTERED	  - The event has a filter attached
 *  NO_SET_FILTER - Set when filter has error and is to be ignored
 *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
 *  SOFT_DISABLED - When set, do not trace the event (even though its
 *                   tracepoint may be enabled)
 *  TRIGGER_MODE  - When set, invoke the triggers associated with the event
 *  TRIGGER_COND  - When set, one or more triggers has an associated filter
 *  PID_FILTER    - When set, the event is filtered based on pid
 *  WAS_ENABLED   - Set when enabled to know to clear trace on module removal
 *  FREED         - File descriptor is freed, all fields should be considered invalid
 */
enum {
	EVENT_FILE_FL_ENABLED		= (1 << EVENT_FILE_FL_ENABLED_BIT),
	EVENT_FILE_FL_RECORDED_CMD	= (1 << EVENT_FILE_FL_RECORDED_CMD_BIT),
	EVENT_FILE_FL_RECORDED_TGID	= (1 << EVENT_FILE_FL_RECORDED_TGID_BIT),
	EVENT_FILE_FL_FILTERED		= (1 << EVENT_FILE_FL_FILTERED_BIT),
	EVENT_FILE_FL_NO_SET_FILTER	= (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT),
	EVENT_FILE_FL_SOFT_MODE		= (1 << EVENT_FILE_FL_SOFT_MODE_BIT),
	EVENT_FILE_FL_SOFT_DISABLED	= (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT),
	EVENT_FILE_FL_TRIGGER_MODE	= (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
	EVENT_FILE_FL_PID_FILTER	= (1 << EVENT_FILE_FL_PID_FILTER_BIT),
	EVENT_FILE_FL_WAS_ENABLED	= (1 << EVENT_FILE_FL_WAS_ENABLED_BIT),
	EVENT_FILE_FL_FREED		= (1 << EVENT_FILE_FL_FREED_BIT),
};

struct trace_event_file {
	struct list_head		list;
	struct trace_event_call		*event_call;
	struct event_filter __rcu	*filter;
	struct eventfs_inode		*ei;
	struct trace_array		*tr;
	struct trace_subsystem_dir	*system;
	struct list_head		triggers;

	/*
	 * 32 bit flags:
	 *   bit 0:		enabled
	 *   bit 1:		enabled cmd record
	 *   bit 2:		enable/disable with the soft disable bit
	 *   bit 3:		soft disabled
	 *   bit 4:		trigger enabled
	 *
	 * Note: The bits must be set atomically to prevent races
	 * from other writers. Reads of flags do not need to be in
	 * sync as they occur in critical sections. But the way flags
	 * is currently used, these changes do not affect the code
	 * except that when a change is made, it may have a slight
	 * delay in propagating the changes to other CPUs due to
	 * caching and such. Which is mostly OK ;-)
	 *
	 * NOTE(review): the bit list above does not match the order of
	 * the EVENT_FILE_FL_*_BIT enum in this header (there bit 2 is
	 * RECORDED_TGID and bit 3 is FILTERED); treat the enum as
	 * authoritative and confirm before relying on this list.
	 */
	unsigned long		flags;
	refcount_t		ref;	/* ref count for opened files */
	atomic_t		sm_ref;	/* soft-mode reference counter */
	atomic_t		tm_ref;	/* trigger-mode reference counter */
};

/* Emits an early_initcall that ORs @value into event_<name>.flags. */
#define __TRACE_EVENT_FLAGS(name, value)				\
	static int __init trace_init_flags_##name(void)			\
	{								\
		event_##name.flags |= value;				\
		return 0;						\
	}								\
	early_initcall(trace_init_flags_##name);

/*
 * Emits a perf_perm_<name>() function that evaluates @expr, plus an
 * early_initcall that installs it as event_<name>.perf_perm.
 */
#define __TRACE_EVENT_PERF_PERM(name, expr...)				\
	static int perf_perm_##name(struct trace_event_call *tp_event, \
				    struct perf_event *p_event)		\
	{								\
		return ({ expr; });					\
	}								\
	static int __init trace_init_perf_perm_##name(void)		\
	{								\
		event_##name.perf_perm = &perf_perm_##name;		\
		return 0;						\
	}								\
	early_initcall(trace_init_perf_perm_##name);

#define PERF_MAX_TRACE_SIZE	8192

#define MAX_FILTER_STR_VAL	256U	/* Should handle KSYM_SYMBOL_LEN */

enum event_trigger_type {
	ETT_NONE		= (0),
	ETT_TRACE_ONOFF		= (1 << 0),
	ETT_SNAPSHOT		= (1 << 1),
	ETT_STACKTRACE		= (1 << 2),
	ETT_EVENT_ENABLE	= (1 << 3),
	ETT_EVENT_HIST		= (1 << 4),
	ETT_HIST_ENABLE		= (1 << 5),
	ETT_EVENT_EPROBE	= (1 << 6),
};

extern int filter_match_preds(struct event_filter *filter, void *rec);

extern enum event_trigger_type
event_triggers_call(struct trace_event_file *file,
		    struct trace_buffer *buffer, void *rec,
		    struct ring_buffer_event *event);
extern void
event_triggers_post_call(struct trace_event_file *file,
			 enum event_trigger_type tt);

bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);

bool __trace_trigger_soft_disabled(struct trace_event_file *file);

/**
 * trace_trigger_soft_disabled - do triggers and test if soft disabled
 * @file: The file pointer of the event to test
 *
 * If any triggers without filters are attached to this event, they
 * will be called here. If the event is soft disabled and has no
 * triggers that require testing the fields, it will return true,
 * otherwise false.
 */
static __always_inline bool
trace_trigger_soft_disabled(struct trace_event_file *file)
{
	unsigned long eflags = file->flags;

	/* Common case: none of the three flags is set, nothing to do. */
	if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE |
			       EVENT_FILE_FL_SOFT_DISABLED |
			       EVENT_FILE_FL_PID_FILTER))))
		return false;

	if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND))
		return false;

	return __trace_trigger_soft_disabled(file);
}

#ifdef CONFIG_BPF_EVENTS
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
void perf_event_detach_bpf_prog(struct perf_event *event);
int perf_event_query_prog_array(struct perf_event *event, void __user *info);

struct bpf_raw_tp_link;
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);

struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
			    u32 *fd_type, const char **buf,
			    u64 *probe_offset, u64 *probe_addr,
			    unsigned long *missed);
int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
#else
/*
 * Stubs used when CONFIG_BPF_EVENTS is not set: trace_call_bpf() always
 * returns 1, the int-returning helpers return -EOPNOTSUPP,
 * bpf_get_raw_tracepoint() returns NULL and the void helpers do nothing.
 */
static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
	return 1;
}

static inline int
perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
{
	return -EOPNOTSUPP;
}

static inline void perf_event_detach_bpf_prog(struct perf_event *event) { }

static inline int
perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
	return -EOPNOTSUPP;
}
struct bpf_raw_tp_link;
static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
	return -EOPNOTSUPP;
}
static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
	return -EOPNOTSUPP;
}
static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name)
{
	return NULL;
}
static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
}
static inline int bpf_get_perf_event_info(const struct perf_event *event,
					  u32 *prog_id, u32 *fd_type,
					  const char **buf, u64 *probe_offset,
					  u64 *probe_addr, unsigned long *missed)
{
	return -EOPNOTSUPP;
}
static inline int
bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	return -EOPNOTSUPP;
}
static inline int
bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	return -EOPNOTSUPP;
}
#endif

enum {
	FILTER_OTHER = 0,
	FILTER_STATIC_STRING,
	FILTER_DYN_STRING,
	FILTER_RDYN_STRING,
	FILTER_PTR_STRING,
	FILTER_TRACE_FN,
	FILTER_CPUMASK,
	FILTER_COMM,
	FILTER_CPU,
	FILTER_STACKTRACE,
};

extern int trace_event_raw_init(struct trace_event_call *call);
extern int trace_define_field(struct trace_event_call *call, const char *type,
			      const char *name, int offset, int size,
			      int is_signed, int filter_type);
extern int trace_add_event_call(struct trace_event_call *call);
extern int trace_remove_event_call(struct trace_event_call *call);
extern int trace_event_get_offsets(struct trace_event_call *call);

int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
int trace_set_clr_event(const char *system, const char *event, int set);
int trace_array_set_clr_event(struct trace_array *tr, const char *system,
		const char *event, bool enable);
/*
 * The double __builtin_constant_p is because gcc will give us an error
 * if we try to allocate the static variable to fmt if it is not a
 * constant. Even with the outer if statement optimizing out.
 */
#define event_trace_printk(ip, fmt, args...)				\
do {									\
	__trace_printk_check_format(fmt, ##args);			\
	tracing_record_cmdline(current);				\
	if (__builtin_constant_p(fmt)) {				\
		static const char *trace_printk_fmt			\
		  __section("__trace_printk_fmt") =			\
			__builtin_constant_p(fmt) ? fmt : NULL;		\
									\
		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
	} else								\
		__trace_printk(ip, fmt, ##args);			\
} while (0)

#ifdef CONFIG_PERF_EVENTS
struct perf_event;

DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);

extern int  perf_trace_init(struct perf_event *event);
extern void perf_trace_destroy(struct perf_event *event);
extern int  perf_trace_add(struct perf_event *event, int flags);
extern void perf_trace_del(struct perf_event *event, int flags);
#ifdef CONFIG_KPROBE_EVENTS
extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
extern void perf_kprobe_destroy(struct perf_event *event);
extern int bpf_get_kprobe_info(const struct perf_event *event,
			       u32 *fd_type, const char **symbol,
			       u64 *probe_offset, u64 *probe_addr,
			       unsigned long *missed,
			       bool perf_type_tracepoint);
#endif
#ifdef CONFIG_UPROBE_EVENTS
extern int  perf_uprobe_init(struct perf_event *event,
			     unsigned long ref_ctr_offset, bool is_retprobe);
extern void perf_uprobe_destroy(struct perf_event *event);
extern int bpf_get_uprobe_info(const struct perf_event *event,
			       u32 *fd_type, const char **filename,
			       u64 *probe_offset, u64 *probe_addr,
			       bool perf_type_tracepoint);
#endif
extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
				     char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
void perf_trace_buf_update(void *record, u16 type);
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);

int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
void perf_event_free_bpf_prog(struct perf_event *event);

/* One entry point per argument count, from 1 up to 12 u64 arguments. */
void bpf_trace_run1(struct bpf_raw_tp_link *link, u64 arg1);
void bpf_trace_run2(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2);
void bpf_trace_run3(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3);
void bpf_trace_run4(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3, u64 arg4);
void bpf_trace_run5(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3, u64 arg4, u64 arg5);
void bpf_trace_run6(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
void bpf_trace_run7(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
void bpf_trace_run8(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
		    u64 arg8);
void bpf_trace_run9(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
		    u64 arg8, u64 arg9);
void bpf_trace_run10(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
		     u64 arg8, u64 arg9, u64 arg10);
void bpf_trace_run11(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
		     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
void bpf_trace_run12(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
		     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
			       struct trace_event_call *call, u64 count,
			       struct pt_regs *regs, struct hlist_head *head,
			       struct task_struct *task);

/*
 * Thin wrapper that forwards its arguments to perf_tp_event(); note that
 * the argument order differs between the two.
 */
static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
		       u64 count, struct pt_regs *regs, void *head,
		       struct task_struct *task)
{
	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
}

#endif

#define TRACE_EVENT_STR_MAX	512

/*
 * gcc warns that you can not use a va_list in an inlined
 * function. But lets me make it into a macro :-/
 *
 * Evaluates to the length vsnprintf() reports for @fmt with a copy of
 * *@va, plus one, capped at TRACE_EVENT_STR_MAX.
 */
#define __trace_event_vstr_len(fmt, va)			\
({							\
	va_list __ap;					\
	int __ret;					\
							\
	va_copy(__ap, *(va));				\
	__ret = vsnprintf(NULL, 0, fmt, __ap) + 1;	\
	va_end(__ap);					\
							\
	min(__ret, TRACE_EVENT_STR_MAX);		\
})

#endif /* _LINUX_TRACE_EVENT_H */

/*
 * Note: we keep the TRACE_CUSTOM_EVENT outside the include file ifdef protection.
 *  This is due to the way trace custom events work. If a file includes two
 *  trace event headers under one "CREATE_CUSTOM_TRACE_EVENTS" the first include
 *  will override the TRACE_CUSTOM_EVENT and break the second include.
 */

/* Fallback: all three macros expand to nothing when TRACE_CUSTOM_EVENT is not defined. */
#ifndef TRACE_CUSTOM_EVENT

#define DECLARE_CUSTOM_EVENT_CLASS(name, proto, args, tstruct, assign, print)
#define DEFINE_CUSTOM_EVENT(template, name, proto, args)
#define TRACE_CUSTOM_EVENT(name, proto, args, struct, assign, print)

#endif /* ifdef TRACE_CUSTOM_EVENT (see note above) */
2 1 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 
1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 // SPDX-License-Identifier: GPL-2.0+ /* * Driver for SanDisk SDDR-09 SmartMedia reader * * (c) 2000, 2001 Robert Baruch 
(autophile@starband.net)
 * (c) 2002 Andries Brouwer (aeb@cwi.nl)
 * Developed with the assistance of:
 * (c) 2002 Alan Stern <stern@rowland.org>
 *
 * The SanDisk SDDR-09 SmartMedia reader uses the Shuttle EUSB-01 chip.
 * This chip is a programmable USB controller. In the SDDR-09, it has
 * been programmed to obey a certain limited set of SCSI commands.
 * This driver translates the "real" SCSI commands to the SDDR-09 SCSI
 * commands.
 */

/*
 * Known vendor commands: 12 bytes, first byte is opcode
 *
 * E7: read scatter gather
 * E8: read
 * E9: write
 * EA: erase
 * EB: reset
 * EC: read status
 * ED: read ID
 * EE: write CIS (?)
 * EF: compute checksum (?)
 */

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <scsi/scsi.h>
#include <scsi/scsi_cmnd.h>
#include <scsi/scsi_device.h>

#include "usb.h"
#include "transport.h"
#include "protocol.h"
#include "debug.h"
#include "scsiglue.h"

#define DRV_NAME "ums-sddr09"

MODULE_DESCRIPTION("Driver for SanDisk SDDR-09 SmartMedia reader");
MODULE_AUTHOR("Andries Brouwer <aeb@cwi.nl>, Robert Baruch <autophile@starband.net>");
MODULE_LICENSE("GPL");
MODULE_IMPORT_NS("USB_STORAGE");

/* Forward declarations; the definitions appear later in this file. */
static int usb_stor_sddr09_dpcm_init(struct us_data *us);
static int sddr09_transport(struct scsi_cmnd *srb, struct us_data *us);
static int usb_stor_sddr09_init(struct us_data *us);


/*
 * The table of devices
 *
 * UNUSUAL_DEV is defined twice so that the same "unusual_sddr09.h" entry
 * list can be expanded into two different tables.  This first expansion
 * keeps only the USB id/version match and the flags.
 */
#define UNUSUAL_DEV(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax, \
		    vendorName, productName, useProtocol, useTransport, \
		    initFunction, flags) \
{ USB_DEVICE_VER(id_vendor, id_product, bcdDeviceMin, bcdDeviceMax), \
  .driver_info = (flags) }

static const struct usb_device_id sddr09_usb_ids[] = {
#	include "unusual_sddr09.h"
	{ }		/* Terminating entry */
};
MODULE_DEVICE_TABLE(usb, sddr09_usb_ids);

#undef UNUSUAL_DEV

/*
 * The flags table
 *
 * Second expansion of the same entry list: keeps the names, protocol,
 * transport and init function of each entry (the ids and flags are dropped).
 */
#define UNUSUAL_DEV(idVendor, idProduct, bcdDeviceMin, bcdDeviceMax, \
		    vendor_name, product_name, use_protocol, use_transport, \
		    init_function, Flags) \
{ \
	.vendorName = vendor_name,	\
	.productName = product_name,	\
	.useProtocol = use_protocol,	\
	.useTransport = use_transport,	\
	.initFunction = init_function,	\
}

static const struct us_unusual_dev sddr09_unusual_dev_list[] = {
#	include "unusual_sddr09.h"
	{ }		/* Terminating entry */
};

#undef UNUSUAL_DEV


/*
 * Byte helpers.
 * short_pack(lsb, msb): build a 16-bit value with lsb in the low byte.
 * LSB_of(s): the low 8 bits of s.
 * MSB_of(s): s shifted right by 8; note the result is NOT masked, so
 *            callers rely on the destination type to drop higher bits.
 */
#define short_pack(lsb,msb) ( ((u16)(lsb)) | ( ((u16)(msb))<<8 ) )
#define LSB_of(s) ((s)&0xFF)
#define MSB_of(s) ((s)>>8)

/*
 * First some stuff that does not belong here:
 * data on SmartMedia and other cards, completely
 * unrelated to this driver.
 * Similar stuff occurs in <linux/mtd/nand_ids.h>.
 */

/* Geometry of one flash chip model; all sizes are stored as log2 shifts. */
struct nand_flash_dev {
	int model_id;
	int chipshift;		/* 1<<cs bytes total capacity */
	char pageshift;		/* 1<<ps bytes in a page */
	char blockshift;	/* 1<<bs pages in an erase block */
	char zoneshift;		/* 1<<zs blocks in a zone */
				/* # of logical blocks is 125/128 of this */
	char pageadrlen;	/* length of an address in bytes - 1 */
};

/*
 * NAND Flash Manufacturer ID Codes
 */
#define NAND_MFR_AMD		0x01
#define NAND_MFR_NATSEMI	0x8f
#define NAND_MFR_TOSHIBA	0x98
#define NAND_MFR_SAMSUNG	0xec

/*
 * Map a manufacturer ID code to a printable name.
 * Returns a pointer to a string literal ("unknown" for any code that is
 * not one of the NAND_MFR_* values above); the caller must not modify it.
 */
static inline char *nand_flash_manufacturer(int manuf_id) {
	switch(manuf_id) {
	case NAND_MFR_AMD:
		return "AMD";
	case NAND_MFR_NATSEMI:
		return "NATSEMI";
	case NAND_MFR_TOSHIBA:
		return "Toshiba";
	case NAND_MFR_SAMSUNG:
		return "Samsung";
	default:
		return "unknown";
	}
}

/*
 * It looks like it is unnecessary to attach manufacturer to the
 * remaining data: SSFDC prescribes manufacturer-independent id codes.
 *
 * 256 MB NAND flash has a 5-byte ID with 2nd byte 0xaa, 0xba, 0xca or 0xda.
*/ static struct nand_flash_dev nand_flash_ids[] = { /* NAND flash */ { 0x6e, 20, 8, 4, 8, 2}, /* 1 MB */ { 0xe8, 20, 8, 4, 8, 2}, /* 1 MB */ { 0xec, 20, 8, 4, 8, 2}, /* 1 MB */ { 0x64, 21, 8, 4, 9, 2}, /* 2 MB */ { 0xea, 21, 8, 4, 9, 2}, /* 2 MB */ { 0x6b, 22, 9, 4, 9, 2}, /* 4 MB */ { 0xe3, 22, 9, 4, 9, 2}, /* 4 MB */ { 0xe5, 22, 9, 4, 9, 2}, /* 4 MB */ { 0xe6, 23, 9, 4, 10, 2}, /* 8 MB */ { 0x73, 24, 9, 5, 10, 2}, /* 16 MB */ { 0x75, 25, 9, 5, 10, 2}, /* 32 MB */ { 0x76, 26, 9, 5, 10, 3}, /* 64 MB */ { 0x79, 27, 9, 5, 10, 3}, /* 128 MB */ /* MASK ROM */ { 0x5d, 21, 9, 4, 8, 2}, /* 2 MB */ { 0xd5, 22, 9, 4, 9, 2}, /* 4 MB */ { 0xd6, 23, 9, 4, 10, 2}, /* 8 MB */ { 0x57, 24, 9, 4, 11, 2}, /* 16 MB */ { 0x58, 25, 9, 4, 12, 2}, /* 32 MB */ { 0,} }; static struct nand_flash_dev * nand_find_id(unsigned char id) { int i; for (i = 0; i < ARRAY_SIZE(nand_flash_ids); i++) if (nand_flash_ids[i].model_id == id) return &(nand_flash_ids[i]); return NULL; } /* * ECC computation. */ static unsigned char parity[256]; static unsigned char ecc2[256]; static void nand_init_ecc(void) { int i, j, a; parity[0] = 0; for (i = 1; i < 256; i++) parity[i] = (parity[i&(i-1)] ^ 1); for (i = 0; i < 256; i++) { a = 0; for (j = 0; j < 8; j++) { if (i & (1<<j)) { if ((j & 1) == 0) a ^= 0x04; if ((j & 2) == 0) a ^= 0x10; if ((j & 4) == 0) a ^= 0x40; } } ecc2[i] = ~(a ^ (a<<1) ^ (parity[i] ? 0xa8 : 0)); } } /* compute 3-byte ecc on 256 bytes */ static void nand_compute_ecc(unsigned char *data, unsigned char *ecc) { int i, j, a; unsigned char par = 0, bit, bits[8] = {0}; /* collect 16 checksum bits */ for (i = 0; i < 256; i++) { par ^= data[i]; bit = parity[data[i]]; for (j = 0; j < 8; j++) if ((i & (1<<j)) == 0) bits[j] ^= bit; } /* put 4+4+4 = 12 bits in the ecc */ a = (bits[3] << 6) + (bits[2] << 4) + (bits[1] << 2) + bits[0]; ecc[0] = ~(a ^ (a<<1) ^ (parity[par] ? 0xaa : 0)); a = (bits[7] << 6) + (bits[6] << 4) + (bits[5] << 2) + bits[4]; ecc[1] = ~(a ^ (a<<1) ^ (parity[par] ? 
0xaa : 0)); ecc[2] = ecc2[par]; } static int nand_compare_ecc(unsigned char *data, unsigned char *ecc) { return (data[0] == ecc[0] && data[1] == ecc[1] && data[2] == ecc[2]); } static void nand_store_ecc(unsigned char *data, unsigned char *ecc) { memcpy(data, ecc, 3); } /* * The actual driver starts here. */ struct sddr09_card_info { unsigned long capacity; /* Size of card in bytes */ int pagesize; /* Size of page in bytes */ int pageshift; /* log2 of pagesize */ int blocksize; /* Size of block in pages */ int blockshift; /* log2 of blocksize */ int blockmask; /* 2^blockshift - 1 */ int *lba_to_pba; /* logical to physical map */ int *pba_to_lba; /* physical to logical map */ int lbact; /* number of available pages */ int flags; #define SDDR09_WP 1 /* write protected */ }; /* * On my 16MB card, control blocks have size 64 (16 real control bytes, * and 48 junk bytes). In reality of course the card uses 16 control bytes, * so the reader makes up the remaining 48. Don't know whether these numbers * depend on the card. For now a constant. */ #define CONTROL_SHIFT 6 /* * On my Combo CF/SM reader, the SM reader has LUN 1. * (and things fail with LUN 0). * It seems LUN is irrelevant for others. */ #define LUN 1 #define LUNBITS (LUN << 5) /* * LBA and PBA are unsigned ints. Special values. 
*/ #define UNDEF 0xffffffff #define SPARE 0xfffffffe #define UNUSABLE 0xfffffffd static const int erase_bad_lba_entries = 0; /* send vendor interface command (0x41) */ /* called for requests 0, 1, 8 */ static int sddr09_send_command(struct us_data *us, unsigned char request, unsigned char direction, unsigned char *xfer_data, unsigned int xfer_len) { unsigned int pipe; unsigned char requesttype = (0x41 | direction); int rc; // Get the receive or send control pipe number if (direction == USB_DIR_IN) pipe = us->recv_ctrl_pipe; else pipe = us->send_ctrl_pipe; rc = usb_stor_ctrl_transfer(us, pipe, request, requesttype, 0, 0, xfer_data, xfer_len); switch (rc) { case USB_STOR_XFER_GOOD: return 0; case USB_STOR_XFER_STALLED: return -EPIPE; default: return -EIO; } } static int sddr09_send_scsi_command(struct us_data *us, unsigned char *command, unsigned int command_len) { return sddr09_send_command(us, 0, USB_DIR_OUT, command, command_len); } #if 0 /* * Test Unit Ready Command: 12 bytes. * byte 0: opcode: 00 */ static int sddr09_test_unit_ready(struct us_data *us) { unsigned char *command = us->iobuf; int result; memset(command, 0, 6); command[1] = LUNBITS; result = sddr09_send_scsi_command(us, command, 6); usb_stor_dbg(us, "sddr09_test_unit_ready returns %d\n", result); return result; } #endif /* * Request Sense Command: 12 bytes. * byte 0: opcode: 03 * byte 4: data length */ static int sddr09_request_sense(struct us_data *us, unsigned char *sensebuf, int buflen) { unsigned char *command = us->iobuf; int result; memset(command, 0, 12); command[0] = 0x03; command[1] = LUNBITS; command[4] = buflen; result = sddr09_send_scsi_command(us, command, 12); if (result) return result; result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, sensebuf, buflen, NULL); return (result == USB_STOR_XFER_GOOD ? 0 : -EIO); } /* * Read Command: 12 bytes. * byte 0: opcode: E8 * byte 1: last two bits: 00: read data, 01: read blockwise control, * 10: read both, 11: read pagewise control. 
* It turns out we need values 20, 21, 22, 23 here (LUN 1).
 * bytes 2-5: address (interpretation depends on byte 1, see below)
 * bytes 10-11: count (idem)
 *
 * A page has 512 data bytes and 64 control bytes (16 control and 48 junk).
 * A read data command gets data in 512-byte pages.
 * A read control command gets control in 64-byte chunks.
 * A read both command gets data+control in 576-byte chunks.
 *
 * Blocks are groups of 32 pages, and read blockwise control jumps to the
 * next block, while read pagewise control jumps to the next page after
 * reading a group of 64 control bytes.
 * [Here 512 = 1<<pageshift, 32 = 1<<blockshift, 64 is constant?]
 *
 * (1 MB and 2 MB cards are a bit different, but I have only a 16 MB card.)
 */

/*
 * Common worker for the E8 read variants.
 *
 * x selects the variant (OR-ed into byte 1), fromaddress and nr_of_pages
 * are stored big-endian in bytes 2-5 and 10-11, and bulklen bytes are then
 * read from the bulk-in pipe into buf (a scatter-gather list when use_sg
 * is nonzero).  Returns 0 on success, the command error if the command
 * could not be sent, or -EIO if the bulk transfer was not fully good.
 */
static int
sddr09_readX(struct us_data *us, int x, unsigned long fromaddress,
	     int nr_of_pages, int bulklen, unsigned char *buf,
	     int use_sg) {
	unsigned char *cmd = us->iobuf;
	unsigned long addr_hi = fromaddress >> 16;
	unsigned long addr_lo = fromaddress & 0xFFFF;
	int rc;

	cmd[0] = 0xE8;
	cmd[1] = LUNBITS | x;

	/* bytes 2-5: address, most significant byte first */
	cmd[2] = MSB_of(addr_hi);
	cmd[3] = LSB_of(addr_hi);
	cmd[4] = MSB_of(addr_lo);
	cmd[5] = LSB_of(addr_lo);

	/* bytes 6-9 are not used by this command */
	memset(&cmd[6], 0, 4);

	/* bytes 10-11: count, most significant byte first */
	cmd[10] = MSB_of(nr_of_pages);
	cmd[11] = LSB_of(nr_of_pages);

	rc = sddr09_send_scsi_command(us, cmd, 12);
	if (rc) {
		usb_stor_dbg(us, "Result for send_control in sddr09_read2%d %d\n",
			     x, rc);
		return rc;
	}

	rc = usb_stor_bulk_transfer_sg(us, us->recv_bulk_pipe,
				       buf, bulklen, use_sg, NULL);
	if (rc == USB_STOR_XFER_GOOD)
		return 0;

	usb_stor_dbg(us, "Result for bulk_transfer in sddr09_read2%d %d\n",
		     x, rc);
	return -EIO;
}

/*
 * Read Data
 *
 * fromaddress counts data shorts:
 * increasing it by 256 shifts the bytestream by 512 bytes;
 * the last 8 bits are ignored.
 *
 * nr_of_pages counts pages of size (1 << pageshift).
*/ static int sddr09_read20(struct us_data *us, unsigned long fromaddress, int nr_of_pages, int pageshift, unsigned char *buf, int use_sg) { int bulklen = nr_of_pages << pageshift; /* The last 8 bits of fromaddress are ignored. */ return sddr09_readX(us, 0, fromaddress, nr_of_pages, bulklen, buf, use_sg); } /* * Read Blockwise Control * * fromaddress gives the starting position (as in read data; * the last 8 bits are ignored); increasing it by 32*256 shifts * the output stream by 64 bytes. * * count counts control groups of size (1 << controlshift). * For me, controlshift = 6. Is this constant? * * After getting one control group, jump to the next block * (fromaddress += 8192). */ static int sddr09_read21(struct us_data *us, unsigned long fromaddress, int count, int controlshift, unsigned char *buf, int use_sg) { int bulklen = (count << controlshift); return sddr09_readX(us, 1, fromaddress, count, bulklen, buf, use_sg); } /* * Read both Data and Control * * fromaddress counts data shorts, ignoring control: * increasing it by 256 shifts the bytestream by 576 = 512+64 bytes; * the last 8 bits are ignored. * * nr_of_pages counts pages of size (1 << pageshift) + (1 << controlshift). */ static int sddr09_read22(struct us_data *us, unsigned long fromaddress, int nr_of_pages, int pageshift, unsigned char *buf, int use_sg) { int bulklen = (nr_of_pages << pageshift) + (nr_of_pages << CONTROL_SHIFT); usb_stor_dbg(us, "reading %d pages, %d bytes\n", nr_of_pages, bulklen); return sddr09_readX(us, 2, fromaddress, nr_of_pages, bulklen, buf, use_sg); } #if 0 /* * Read Pagewise Control * * fromaddress gives the starting position (as in read data; * the last 8 bits are ignored); increasing it by 256 shifts * the output stream by 64 bytes. * * count counts control groups of size (1 << controlshift). * For me, controlshift = 6. Is this constant? * * After getting one control group, jump to the next page * (fromaddress += 256). 
*/ static int sddr09_read23(struct us_data *us, unsigned long fromaddress, int count, int controlshift, unsigned char *buf, int use_sg) { int bulklen = (count << controlshift); return sddr09_readX(us, 3, fromaddress, count, bulklen, buf, use_sg); } #endif /* * Erase Command: 12 bytes. * byte 0: opcode: EA * bytes 6-9: erase address (big-endian, counting shorts, sector aligned). * * Always precisely one block is erased; bytes 2-5 and 10-11 are ignored. * The byte address being erased is 2*Eaddress. * The CIS cannot be erased. */ static int sddr09_erase(struct us_data *us, unsigned long Eaddress) { unsigned char *command = us->iobuf; int result; usb_stor_dbg(us, "erase address %lu\n", Eaddress); memset(command, 0, 12); command[0] = 0xEA; command[1] = LUNBITS; command[6] = MSB_of(Eaddress>>16); command[7] = LSB_of(Eaddress>>16); command[8] = MSB_of(Eaddress & 0xFFFF); command[9] = LSB_of(Eaddress & 0xFFFF); result = sddr09_send_scsi_command(us, command, 12); if (result) usb_stor_dbg(us, "Result for send_control in sddr09_erase %d\n", result); return result; } /* * Write CIS Command: 12 bytes. * byte 0: opcode: EE * bytes 2-5: write address in shorts * bytes 10-11: sector count * * This writes at the indicated address. Don't know how it differs * from E9. Maybe it does not erase? However, it will also write to * the CIS. * * When two such commands on the same page follow each other directly, * the second one is not done. */ /* * Write Command: 12 bytes. * byte 0: opcode: E9 * bytes 2-5: write address (big-endian, counting shorts, sector aligned). * bytes 6-9: erase address (big-endian, counting shorts, sector aligned). * bytes 10-11: sector count (big-endian, in 512-byte sectors). * * If write address equals erase address, the erase is done first, * otherwise the write is done first. When erase address equals zero * no erase is done? 
*/ static int sddr09_writeX(struct us_data *us, unsigned long Waddress, unsigned long Eaddress, int nr_of_pages, int bulklen, unsigned char *buf, int use_sg) { unsigned char *command = us->iobuf; int result; command[0] = 0xE9; command[1] = LUNBITS; command[2] = MSB_of(Waddress>>16); command[3] = LSB_of(Waddress>>16); command[4] = MSB_of(Waddress & 0xFFFF); command[5] = LSB_of(Waddress & 0xFFFF); command[6] = MSB_of(Eaddress>>16); command[7] = LSB_of(Eaddress>>16); command[8] = MSB_of(Eaddress & 0xFFFF); command[9] = LSB_of(Eaddress & 0xFFFF); command[10] = MSB_of(nr_of_pages); command[11] = LSB_of(nr_of_pages); result = sddr09_send_scsi_command(us, command, 12); if (result) { usb_stor_dbg(us, "Result for send_control in sddr09_writeX %d\n", result); return result; } result = usb_stor_bulk_transfer_sg(us, us->send_bulk_pipe, buf, bulklen, use_sg, NULL); if (result != USB_STOR_XFER_GOOD) { usb_stor_dbg(us, "Result for bulk_transfer in sddr09_writeX %d\n", result); return -EIO; } return 0; } /* erase address, write same address */ static int sddr09_write_inplace(struct us_data *us, unsigned long address, int nr_of_pages, int pageshift, unsigned char *buf, int use_sg) { int bulklen = (nr_of_pages << pageshift) + (nr_of_pages << CONTROL_SHIFT); return sddr09_writeX(us, address, address, nr_of_pages, bulklen, buf, use_sg); } #if 0 /* * Read Scatter Gather Command: 3+4n bytes. * byte 0: opcode E7 * byte 2: n * bytes 4i-1,4i,4i+1: page address * byte 4i+2: page count * (i=1..n) * * This reads several pages from the card to a single memory buffer. * The last two bits of byte 1 have the same meaning as for E8. 
*/
/*
 * Test-only exerciser for the E7 command (compiled out by the enclosing
 * #if 0).  Builds a command with three address/count entries, sends it,
 * reads the resulting bulk data into a temporary buffer and discards it.
 * Returns 0 on success, the command error, -ENOMEM, or -EIO.
 */
static int
sddr09_read_sg_test_only(struct us_data *us) {
	unsigned char *command = us->iobuf;
	int result, bulklen, nsg, ct;
	unsigned char *buf;
	unsigned long address;

	nsg = bulklen = 0;
	command[0] = 0xE7;
	command[1] = LUNBITS;
	command[2] = 0;
	/* note: the three addresses below are octal literals */
	address = 040000; ct = 1;
	nsg++;
	bulklen += (ct << 9);
	command[4*nsg+2] = ct;
	command[4*nsg+1] = ((address >> 9) & 0xFF);
	command[4*nsg+0] = ((address >> 17) & 0xFF);
	command[4*nsg-1] = ((address >> 25) & 0xFF);

	address = 0340000; ct = 1;
	nsg++;
	bulklen += (ct << 9);
	command[4*nsg+2] = ct;
	command[4*nsg+1] = ((address >> 9) & 0xFF);
	command[4*nsg+0] = ((address >> 17) & 0xFF);
	command[4*nsg-1] = ((address >> 25) & 0xFF);

	address = 01000000; ct = 2;
	nsg++;
	bulklen += (ct << 9);
	command[4*nsg+2] = ct;
	command[4*nsg+1] = ((address >> 9) & 0xFF);
	command[4*nsg+0] = ((address >> 17) & 0xFF);
	command[4*nsg-1] = ((address >> 25) & 0xFF);

	/* byte 2 carries the number of entries filled in above */
	command[2] = nsg;

	result = sddr09_send_scsi_command(us, command, 4*nsg+3);

	if (result) {
		usb_stor_dbg(us, "Result for send_control in sddr09_read_sg %d\n",
			     result);
		return result;
	}

	buf = kmalloc(bulklen, GFP_NOIO);
	if (!buf)
		return -ENOMEM;

	result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe,
				       buf, bulklen, NULL);
	/* the data is only read to exercise the command; it is not used */
	kfree(buf);
	if (result != USB_STOR_XFER_GOOD) {
		usb_stor_dbg(us, "Result for bulk_transfer in sddr09_read_sg %d\n",
			     result);
		return -EIO;
	}

	return 0;
}
#endif

/*
 * Read Status Command: 12 bytes.
 * byte 0: opcode: EC
 *
 * Returns 64 bytes, all zero except for the first.
* bit 0: 1: Error * bit 5: 1: Suspended * bit 6: 1: Ready * bit 7: 1: Not write-protected */ static int sddr09_read_status(struct us_data *us, unsigned char *status) { unsigned char *command = us->iobuf; unsigned char *data = us->iobuf; int result; usb_stor_dbg(us, "Reading status...\n"); memset(command, 0, 12); command[0] = 0xEC; command[1] = LUNBITS; result = sddr09_send_scsi_command(us, command, 12); if (result) return result; result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, data, 64, NULL); *status = data[0]; return (result == USB_STOR_XFER_GOOD ? 0 : -EIO); } static int sddr09_read_data(struct us_data *us, unsigned long address, unsigned int sectors) { struct sddr09_card_info *info = (struct sddr09_card_info *) us->extra; unsigned char *buffer; unsigned int lba, maxlba, pba; unsigned int page, pages; unsigned int len, offset; struct scatterlist *sg; int result; // Figure out the initial LBA and page lba = address >> info->blockshift; page = (address & info->blockmask); maxlba = info->capacity >> (info->pageshift + info->blockshift); if (lba >= maxlba) return -EIO; // Since we only read in one block at a time, we have to create // a bounce buffer and move the data a piece at a time between the // bounce buffer and the actual transfer buffer. len = min_t(unsigned int, sectors, info->blocksize) * info->pagesize; buffer = kmalloc(len, GFP_NOIO); if (!buffer) return -ENOMEM; // This could be made much more efficient by checking for // contiguous LBA's. Another exercise left to the student. result = 0; offset = 0; sg = NULL; while (sectors > 0) { /* Find number of pages we can read in this block */ pages = min(sectors, info->blocksize - page); len = pages << info->pageshift; /* Not overflowing capacity? 
*/ if (lba >= maxlba) { usb_stor_dbg(us, "Error: Requested lba %u exceeds maximum %u\n", lba, maxlba); result = -EIO; break; } /* Find where this lba lives on disk */ pba = info->lba_to_pba[lba]; if (pba == UNDEF) { /* this lba was never written */ usb_stor_dbg(us, "Read %d zero pages (LBA %d) page %d\n", pages, lba, page); /* * This is not really an error. It just means * that the block has never been written. * Instead of returning an error * it is better to return all zero data. */ memset(buffer, 0, len); } else { usb_stor_dbg(us, "Read %d pages, from PBA %d (LBA %d) page %d\n", pages, pba, lba, page); address = ((pba << info->blockshift) + page) << info->pageshift; result = sddr09_read20(us, address>>1, pages, info->pageshift, buffer, 0); if (result) break; } // Store the data in the transfer buffer usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &offset, TO_XFER_BUF); page = 0; lba++; sectors -= pages; } kfree(buffer); return result; } static unsigned int sddr09_find_unused_pba(struct sddr09_card_info *info, unsigned int lba) { static unsigned int lastpba = 1; int zonestart, end, i; zonestart = (lba/1000) << 10; end = info->capacity >> (info->blockshift + info->pageshift); end -= zonestart; if (end > 1024) end = 1024; for (i = lastpba+1; i < end; i++) { if (info->pba_to_lba[zonestart+i] == UNDEF) { lastpba = i; return zonestart+i; } } for (i = 0; i <= lastpba; i++) { if (info->pba_to_lba[zonestart+i] == UNDEF) { lastpba = i; return zonestart+i; } } return 0; } static int sddr09_write_lba(struct us_data *us, unsigned int lba, unsigned int page, unsigned int pages, unsigned char *ptr, unsigned char *blockbuffer) { struct sddr09_card_info *info = (struct sddr09_card_info *) us->extra; unsigned long address; unsigned int pba, lbap; unsigned int pagelen; unsigned char *bptr, *cptr, *xptr; unsigned char ecc[3]; int i, result; lbap = ((lba % 1000) << 1) | 0x1000; if (parity[MSB_of(lbap) ^ LSB_of(lbap)]) lbap ^= 1; pba = info->lba_to_pba[lba]; if (pba == UNDEF) { 
pba = sddr09_find_unused_pba(info, lba); if (!pba) { printk(KERN_WARNING "sddr09_write_lba: Out of unused blocks\n"); return -ENOSPC; } info->pba_to_lba[pba] = lba; info->lba_to_pba[lba] = pba; } if (pba == 1) { /* * Maybe it is impossible to write to PBA 1. * Fake success, but don't do anything. */ printk(KERN_WARNING "sddr09: avoid writing to pba 1\n"); return 0; } pagelen = (1 << info->pageshift) + (1 << CONTROL_SHIFT); /* read old contents */ address = (pba << (info->pageshift + info->blockshift)); result = sddr09_read22(us, address>>1, info->blocksize, info->pageshift, blockbuffer, 0); if (result) return result; /* check old contents and fill lba */ for (i = 0; i < info->blocksize; i++) { bptr = blockbuffer + i*pagelen; cptr = bptr + info->pagesize; nand_compute_ecc(bptr, ecc); if (!nand_compare_ecc(cptr+13, ecc)) { usb_stor_dbg(us, "Warning: bad ecc in page %d- of pba %d\n", i, pba); nand_store_ecc(cptr+13, ecc); } nand_compute_ecc(bptr+(info->pagesize / 2), ecc); if (!nand_compare_ecc(cptr+8, ecc)) { usb_stor_dbg(us, "Warning: bad ecc in page %d+ of pba %d\n", i, pba); nand_store_ecc(cptr+8, ecc); } cptr[6] = cptr[11] = MSB_of(lbap); cptr[7] = cptr[12] = LSB_of(lbap); } /* copy in new stuff and compute ECC */ xptr = ptr; for (i = page; i < page+pages; i++) { bptr = blockbuffer + i*pagelen; cptr = bptr + info->pagesize; memcpy(bptr, xptr, info->pagesize); xptr += info->pagesize; nand_compute_ecc(bptr, ecc); nand_store_ecc(cptr+13, ecc); nand_compute_ecc(bptr+(info->pagesize / 2), ecc); nand_store_ecc(cptr+8, ecc); } usb_stor_dbg(us, "Rewrite PBA %d (LBA %d)\n", pba, lba); result = sddr09_write_inplace(us, address>>1, info->blocksize, info->pageshift, blockbuffer, 0); usb_stor_dbg(us, "sddr09_write_inplace returns %d\n", result); #if 0 { unsigned char status = 0; int result2 = sddr09_read_status(us, &status); if (result2) usb_stor_dbg(us, "cannot read status\n"); else if (status != 0xc0) usb_stor_dbg(us, "status after write: 0x%x\n", status); } #endif #if 0 { 
int result2 = sddr09_test_unit_ready(us); } #endif return result; } static int sddr09_write_data(struct us_data *us, unsigned long address, unsigned int sectors) { struct sddr09_card_info *info = (struct sddr09_card_info *) us->extra; unsigned int lba, maxlba, page, pages; unsigned int pagelen, blocklen; unsigned char *blockbuffer; unsigned char *buffer; unsigned int len, offset; struct scatterlist *sg; int result; /* Figure out the initial LBA and page */ lba = address >> info->blockshift; page = (address & info->blockmask); maxlba = info->capacity >> (info->pageshift + info->blockshift); if (lba >= maxlba) return -EIO; /* * blockbuffer is used for reading in the old data, overwriting * with the new data, and performing ECC calculations */ /* * TODO: instead of doing kmalloc/kfree for each write, * add a bufferpointer to the info structure */ pagelen = (1 << info->pageshift) + (1 << CONTROL_SHIFT); blocklen = (pagelen << info->blockshift); blockbuffer = kmalloc(blocklen, GFP_NOIO); if (!blockbuffer) return -ENOMEM; /* * Since we don't write the user data directly to the device, * we have to create a bounce buffer and move the data a piece * at a time between the bounce buffer and the actual transfer buffer. */ len = min_t(unsigned int, sectors, info->blocksize) * info->pagesize; buffer = kmalloc(len, GFP_NOIO); if (!buffer) { kfree(blockbuffer); return -ENOMEM; } result = 0; offset = 0; sg = NULL; while (sectors > 0) { /* Write as many sectors as possible in this block */ pages = min(sectors, info->blocksize - page); len = (pages << info->pageshift); /* Not overflowing capacity? 
*/ if (lba >= maxlba) { usb_stor_dbg(us, "Error: Requested lba %u exceeds maximum %u\n", lba, maxlba); result = -EIO; break; } /* Get the data from the transfer buffer */ usb_stor_access_xfer_buf(buffer, len, us->srb, &sg, &offset, FROM_XFER_BUF); result = sddr09_write_lba(us, lba, page, pages, buffer, blockbuffer); if (result) break; page = 0; lba++; sectors -= pages; } kfree(buffer); kfree(blockbuffer); return result; } static int sddr09_read_control(struct us_data *us, unsigned long address, unsigned int blocks, unsigned char *content, int use_sg) { usb_stor_dbg(us, "Read control address %lu, blocks %d\n", address, blocks); return sddr09_read21(us, address, blocks, CONTROL_SHIFT, content, use_sg); } /* * Read Device ID Command: 12 bytes. * byte 0: opcode: ED * * Returns 2 bytes: Manufacturer ID and Device ID. * On more recent cards 3 bytes: the third byte is an option code A5 * signifying that the secret command to read an 128-bit ID is available. * On still more recent cards 4 bytes: the fourth byte C0 means that * a second read ID cmd is available. */ static int sddr09_read_deviceID(struct us_data *us, unsigned char *deviceID) { unsigned char *command = us->iobuf; unsigned char *content = us->iobuf; int result, i; memset(command, 0, 12); command[0] = 0xED; command[1] = LUNBITS; result = sddr09_send_scsi_command(us, command, 12); if (result) return result; result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, content, 64, NULL); for (i = 0; i < 4; i++) deviceID[i] = content[i]; return (result == USB_STOR_XFER_GOOD ? 
0 : -EIO); } static int sddr09_get_wp(struct us_data *us, struct sddr09_card_info *info) { int result; unsigned char status; const char *wp_fmt; result = sddr09_read_status(us, &status); if (result) { usb_stor_dbg(us, "read_status fails\n"); return result; } if ((status & 0x80) == 0) { info->flags |= SDDR09_WP; /* write protected */ wp_fmt = " WP"; } else { wp_fmt = ""; } usb_stor_dbg(us, "status 0x%02X%s%s%s%s\n", status, wp_fmt, status & 0x40 ? " Ready" : "", status & LUNBITS ? " Suspended" : "", status & 0x01 ? " Error" : ""); return 0; } #if 0 /* * Reset Command: 12 bytes. * byte 0: opcode: EB */ static int sddr09_reset(struct us_data *us) { unsigned char *command = us->iobuf; memset(command, 0, 12); command[0] = 0xEB; command[1] = LUNBITS; return sddr09_send_scsi_command(us, command, 12); } #endif static struct nand_flash_dev * sddr09_get_cardinfo(struct us_data *us, unsigned char flags) { struct nand_flash_dev *cardinfo; unsigned char deviceID[4]; char blurbtxt[256]; int result; usb_stor_dbg(us, "Reading capacity...\n"); result = sddr09_read_deviceID(us, deviceID); if (result) { usb_stor_dbg(us, "Result of read_deviceID is %d\n", result); printk(KERN_WARNING "sddr09: could not read card info\n"); return NULL; } sprintf(blurbtxt, "sddr09: Found Flash card, ID = %4ph", deviceID); /* Byte 0 is the manufacturer */ sprintf(blurbtxt + strlen(blurbtxt), ": Manuf. %s", nand_flash_manufacturer(deviceID[0])); /* Byte 1 is the device type */ cardinfo = nand_find_id(deviceID[1]); if (cardinfo) { /* * MB or MiB? It is neither. A 16 MB card has * 17301504 raw bytes, of which 16384000 are * usable for user data. 
*/ sprintf(blurbtxt + strlen(blurbtxt), ", %d MB", 1<<(cardinfo->chipshift - 20)); } else { sprintf(blurbtxt + strlen(blurbtxt), ", type unrecognized"); } /* Byte 2 is code to signal availability of 128-bit ID */ if (deviceID[2] == 0xa5) { sprintf(blurbtxt + strlen(blurbtxt), ", 128-bit ID"); } /* Byte 3 announces the availability of another read ID command */ if (deviceID[3] == 0xc0) { sprintf(blurbtxt + strlen(blurbtxt), ", extra cmd"); } if (flags & SDDR09_WP) sprintf(blurbtxt + strlen(blurbtxt), ", WP"); printk(KERN_WARNING "%s\n", blurbtxt); return cardinfo; } static int sddr09_read_map(struct us_data *us) { struct sddr09_card_info *info = (struct sddr09_card_info *) us->extra; int numblocks, alloc_len, alloc_blocks; int i, j, result; unsigned char *buffer, *buffer_end, *ptr; unsigned int lba, lbact; if (!info->capacity) return -1; /* * size of a block is 1 << (blockshift + pageshift) bytes * divide into the total capacity to get the number of blocks */ numblocks = info->capacity >> (info->blockshift + info->pageshift); /* * read 64 bytes for every block (actually 1 << CONTROL_SHIFT) * but only use a 64 KB buffer * buffer size used must be a multiple of (1 << CONTROL_SHIFT) */ #define SDDR09_READ_MAP_BUFSZ 65536 alloc_blocks = min(numblocks, SDDR09_READ_MAP_BUFSZ >> CONTROL_SHIFT); alloc_len = (alloc_blocks << CONTROL_SHIFT); buffer = kmalloc(alloc_len, GFP_NOIO); if (!buffer) { result = -1; goto done; } buffer_end = buffer + alloc_len; #undef SDDR09_READ_MAP_BUFSZ kfree(info->lba_to_pba); kfree(info->pba_to_lba); info->lba_to_pba = kmalloc_array(numblocks, sizeof(int), GFP_NOIO); info->pba_to_lba = kmalloc_array(numblocks, sizeof(int), GFP_NOIO); if (info->lba_to_pba == NULL || info->pba_to_lba == NULL) { printk(KERN_WARNING "sddr09_read_map: out of memory\n"); result = -1; goto done; } for (i = 0; i < numblocks; i++) info->lba_to_pba[i] = info->pba_to_lba[i] = UNDEF; /* * Define lba-pba translation table */ ptr = buffer_end; for (i = 0; i < numblocks; i++) { 
ptr += (1 << CONTROL_SHIFT); if (ptr >= buffer_end) { unsigned long address; address = i << (info->pageshift + info->blockshift); result = sddr09_read_control( us, address>>1, min(alloc_blocks, numblocks - i), buffer, 0); if (result) { result = -1; goto done; } ptr = buffer; } if (i == 0 || i == 1) { info->pba_to_lba[i] = UNUSABLE; continue; } /* special PBAs have control field 0^16 */ for (j = 0; j < 16; j++) if (ptr[j] != 0) goto nonz; info->pba_to_lba[i] = UNUSABLE; printk(KERN_WARNING "sddr09: PBA %d has no logical mapping\n", i); continue; nonz: /* unwritten PBAs have control field FF^16 */ for (j = 0; j < 16; j++) if (ptr[j] != 0xff) goto nonff; continue; nonff: /* normal PBAs start with six FFs */ if (j < 6) { printk(KERN_WARNING "sddr09: PBA %d has no logical mapping: " "reserved area = %02X%02X%02X%02X " "data status %02X block status %02X\n", i, ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5]); info->pba_to_lba[i] = UNUSABLE; continue; } if ((ptr[6] >> 4) != 0x01) { printk(KERN_WARNING "sddr09: PBA %d has invalid address field " "%02X%02X/%02X%02X\n", i, ptr[6], ptr[7], ptr[11], ptr[12]); info->pba_to_lba[i] = UNUSABLE; continue; } /* check even parity */ if (parity[ptr[6] ^ ptr[7]]) { printk(KERN_WARNING "sddr09: Bad parity in LBA for block %d" " (%02X %02X)\n", i, ptr[6], ptr[7]); info->pba_to_lba[i] = UNUSABLE; continue; } lba = short_pack(ptr[7], ptr[6]); lba = (lba & 0x07FF) >> 1; /* * Every 1024 physical blocks ("zone"), the LBA numbers * go back to zero, but are within a higher block of LBA's. * Also, there is a maximum of 1000 LBA's per zone. * In other words, in PBA 1024-2047 you will find LBA 0-999 * which are really LBA 1000-1999. This allows for 24 bad * or special physical blocks per zone. 
*/ if (lba >= 1000) { printk(KERN_WARNING "sddr09: Bad low LBA %d for block %d\n", lba, i); goto possibly_erase; } lba += 1000*(i/0x400); if (info->lba_to_pba[lba] != UNDEF) { printk(KERN_WARNING "sddr09: LBA %d seen for PBA %d and %d\n", lba, info->lba_to_pba[lba], i); goto possibly_erase; } info->pba_to_lba[i] = lba; info->lba_to_pba[lba] = i; continue; possibly_erase: if (erase_bad_lba_entries) { unsigned long address; address = (i << (info->pageshift + info->blockshift)); sddr09_erase(us, address>>1); info->pba_to_lba[i] = UNDEF; } else info->pba_to_lba[i] = UNUSABLE; } /* * Approximate capacity. This is not entirely correct yet, * since a zone with less than 1000 usable pages leads to * missing LBAs. Especially if it is the last zone, some * LBAs can be past capacity. */ lbact = 0; for (i = 0; i < numblocks; i += 1024) { int ct = 0; for (j = 0; j < 1024 && i+j < numblocks; j++) { if (info->pba_to_lba[i+j] != UNUSABLE) { if (ct >= 1000) info->pba_to_lba[i+j] = SPARE; else ct++; } } lbact += ct; } info->lbact = lbact; usb_stor_dbg(us, "Found %d LBA's\n", lbact); result = 0; done: if (result != 0) { kfree(info->lba_to_pba); kfree(info->pba_to_lba); info->lba_to_pba = NULL; info->pba_to_lba = NULL; } kfree(buffer); return result; } static void sddr09_card_info_destructor(void *extra) { struct sddr09_card_info *info = (struct sddr09_card_info *)extra; if (!info) return; kfree(info->lba_to_pba); kfree(info->pba_to_lba); } static int sddr09_common_init(struct us_data *us) { int result; /* set the configuration -- STALL is an acceptable response here */ if (us->pusb_dev->actconfig->desc.bConfigurationValue != 1) { usb_stor_dbg(us, "active config #%d != 1 ??\n", us->pusb_dev->actconfig->desc.bConfigurationValue); return -EINVAL; } result = usb_reset_configuration(us->pusb_dev); usb_stor_dbg(us, "Result of usb_reset_configuration is %d\n", result); if (result == -EPIPE) { usb_stor_dbg(us, "-- stall on control interface\n"); } else if (result != 0) { /* it's not a stall, 
but another error -- time to bail */ usb_stor_dbg(us, "-- Unknown error. Rejecting device\n"); return -EINVAL; } us->extra = kzalloc(sizeof(struct sddr09_card_info), GFP_NOIO); if (!us->extra) return -ENOMEM; us->extra_destructor = sddr09_card_info_destructor; nand_init_ecc(); return 0; } /* * This is needed at a very early stage. If this is not listed in the * unusual devices list but called from here then LUN 0 of the combo reader * is not recognized. But I do not know what precisely these calls do. */ static int usb_stor_sddr09_dpcm_init(struct us_data *us) { int result; unsigned char *data = us->iobuf; result = sddr09_common_init(us); if (result) return result; result = sddr09_send_command(us, 0x01, USB_DIR_IN, data, 2); if (result) { usb_stor_dbg(us, "send_command fails\n"); return result; } usb_stor_dbg(us, "%02X %02X\n", data[0], data[1]); // get 07 02 result = sddr09_send_command(us, 0x08, USB_DIR_IN, data, 2); if (result) { usb_stor_dbg(us, "2nd send_command fails\n"); return result; } usb_stor_dbg(us, "%02X %02X\n", data[0], data[1]); // get 07 00 result = sddr09_request_sense(us, data, 18); if (result == 0 && data[2] != 0) { int j; for (j=0; j<18; j++) printk(" %02X", data[j]); printk("\n"); // get 70 00 00 00 00 00 00 * 00 00 00 00 00 00 // 70: current command // sense key 0, sense code 0, extd sense code 0 // additional transfer length * = sizeof(data) - 7 // Or: 70 00 06 00 00 00 00 0b 00 00 00 00 28 00 00 00 00 00 // sense key 06, sense code 28: unit attention, // not ready to ready transition } // test unit ready return 0; /* not result */ } /* * Transport for the Microtech DPCM-USB */ static int dpcm_transport(struct scsi_cmnd *srb, struct us_data *us) { int ret; usb_stor_dbg(us, "LUN=%d\n", (u8)srb->device->lun); switch (srb->device->lun) { case 0: /* * LUN 0 corresponds to the CompactFlash card reader. */ ret = usb_stor_CB_transport(srb, us); break; case 1: /* * LUN 1 corresponds to the SmartMedia card reader. 
*/ /* * Set the LUN to 0 (just in case). */ srb->device->lun = 0; ret = sddr09_transport(srb, us); srb->device->lun = 1; break; default: usb_stor_dbg(us, "Invalid LUN %d\n", (u8)srb->device->lun); ret = USB_STOR_TRANSPORT_ERROR; break; } return ret; } /* * Transport for the Sandisk SDDR-09 */ static int sddr09_transport(struct scsi_cmnd *srb, struct us_data *us) { static unsigned char sensekey = 0, sensecode = 0; static unsigned char havefakesense = 0; int result, i; unsigned char *ptr = us->iobuf; unsigned long capacity; unsigned int page, pages; struct sddr09_card_info *info; static unsigned char inquiry_response[8] = { 0x00, 0x80, 0x00, 0x02, 0x1F, 0x00, 0x00, 0x00 }; /* note: no block descriptor support */ static unsigned char mode_page_01[19] = { 0x00, 0x0F, 0x00, 0x0, 0x0, 0x0, 0x00, 0x01, 0x0A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; info = (struct sddr09_card_info *)us->extra; if (srb->cmnd[0] == REQUEST_SENSE && havefakesense) { /* for a faked command, we have to follow with a faked sense */ memset(ptr, 0, 18); ptr[0] = 0x70; ptr[2] = sensekey; ptr[7] = 11; ptr[12] = sensecode; usb_stor_set_xfer_buf(ptr, 18, srb); sensekey = sensecode = havefakesense = 0; return USB_STOR_TRANSPORT_GOOD; } havefakesense = 1; /* * Dummy up a response for INQUIRY since SDDR09 doesn't * respond to INQUIRY commands */ if (srb->cmnd[0] == INQUIRY) { memcpy(ptr, inquiry_response, 8); fill_inquiry_response(us, ptr, 36); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == READ_CAPACITY) { struct nand_flash_dev *cardinfo; sddr09_get_wp(us, info); /* read WP bit */ cardinfo = sddr09_get_cardinfo(us, info->flags); if (!cardinfo) { /* probably no media */ init_error: sensekey = 0x02; /* not ready */ sensecode = 0x3a; /* medium not present */ return USB_STOR_TRANSPORT_FAILED; } info->capacity = (1 << cardinfo->chipshift); info->pageshift = cardinfo->pageshift; info->pagesize = (1 << info->pageshift); info->blockshift = cardinfo->blockshift; info->blocksize = (1 
<< info->blockshift); info->blockmask = info->blocksize - 1; // map initialization, must follow get_cardinfo() if (sddr09_read_map(us)) { /* probably out of memory */ goto init_error; } // Report capacity capacity = (info->lbact << info->blockshift) - 1; ((__be32 *) ptr)[0] = cpu_to_be32(capacity); // Report page size ((__be32 *) ptr)[1] = cpu_to_be32(info->pagesize); usb_stor_set_xfer_buf(ptr, 8, srb); return USB_STOR_TRANSPORT_GOOD; } if (srb->cmnd[0] == MODE_SENSE_10) { int modepage = (srb->cmnd[2] & 0x3F); /* * They ask for the Read/Write error recovery page, * or for all pages. */ /* %% We should check DBD %% */ if (modepage == 0x01 || modepage == 0x3F) { usb_stor_dbg(us, "Dummy up request for mode page 0x%x\n", modepage); memcpy(ptr, mode_page_01, sizeof(mode_page_01)); ((__be16*)ptr)[0] = cpu_to_be16(sizeof(mode_page_01) - 2); ptr[3] = (info->flags & SDDR09_WP) ? 0x80 : 0; usb_stor_set_xfer_buf(ptr, sizeof(mode_page_01), srb); return USB_STOR_TRANSPORT_GOOD; } sensekey = 0x05; /* illegal request */ sensecode = 0x24; /* invalid field in CDB */ return USB_STOR_TRANSPORT_FAILED; } if (srb->cmnd[0] == ALLOW_MEDIUM_REMOVAL) return USB_STOR_TRANSPORT_GOOD; havefakesense = 0; if (srb->cmnd[0] == READ_10) { page = short_pack(srb->cmnd[3], srb->cmnd[2]); page <<= 16; page |= short_pack(srb->cmnd[5], srb->cmnd[4]); pages = short_pack(srb->cmnd[8], srb->cmnd[7]); usb_stor_dbg(us, "READ_10: read page %d pagect %d\n", page, pages); result = sddr09_read_data(us, page, pages); return (result == 0 ? USB_STOR_TRANSPORT_GOOD : USB_STOR_TRANSPORT_ERROR); } if (srb->cmnd[0] == WRITE_10) { page = short_pack(srb->cmnd[3], srb->cmnd[2]); page <<= 16; page |= short_pack(srb->cmnd[5], srb->cmnd[4]); pages = short_pack(srb->cmnd[8], srb->cmnd[7]); usb_stor_dbg(us, "WRITE_10: write page %d pagect %d\n", page, pages); result = sddr09_write_data(us, page, pages); return (result == 0 ? 
USB_STOR_TRANSPORT_GOOD : USB_STOR_TRANSPORT_ERROR); } /* * catch-all for all other commands, except * pass TEST_UNIT_READY and REQUEST_SENSE through */ if (srb->cmnd[0] != TEST_UNIT_READY && srb->cmnd[0] != REQUEST_SENSE) { sensekey = 0x05; /* illegal request */ sensecode = 0x20; /* invalid command */ havefakesense = 1; return USB_STOR_TRANSPORT_FAILED; } for (; srb->cmd_len<12; srb->cmd_len++) srb->cmnd[srb->cmd_len] = 0; srb->cmnd[1] = LUNBITS; ptr[0] = 0; for (i=0; i<12; i++) sprintf(ptr+strlen(ptr), "%02X ", srb->cmnd[i]); usb_stor_dbg(us, "Send control for command %s\n", ptr); result = sddr09_send_scsi_command(us, srb->cmnd, 12); if (result) { usb_stor_dbg(us, "sddr09_send_scsi_command returns %d\n", result); return USB_STOR_TRANSPORT_ERROR; } if (scsi_bufflen(srb) == 0) return USB_STOR_TRANSPORT_GOOD; if (srb->sc_data_direction == DMA_TO_DEVICE || srb->sc_data_direction == DMA_FROM_DEVICE) { unsigned int pipe = (srb->sc_data_direction == DMA_TO_DEVICE) ? us->send_bulk_pipe : us->recv_bulk_pipe; usb_stor_dbg(us, "%s %d bytes\n", (srb->sc_data_direction == DMA_TO_DEVICE) ? "sending" : "receiving", scsi_bufflen(srb)); result = usb_stor_bulk_srb(us, pipe, srb); return (result == USB_STOR_XFER_GOOD ? 
USB_STOR_TRANSPORT_GOOD : USB_STOR_TRANSPORT_ERROR); } return USB_STOR_TRANSPORT_GOOD; } /* * Initialization routine for the sddr09 subdriver */ static int usb_stor_sddr09_init(struct us_data *us) { return sddr09_common_init(us); } static struct scsi_host_template sddr09_host_template; static int sddr09_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct us_data *us; int result; result = usb_stor_probe1(&us, intf, id, (id - sddr09_usb_ids) + sddr09_unusual_dev_list, &sddr09_host_template); if (result) return result; if (us->protocol == USB_PR_DPCM_USB) { us->transport_name = "Control/Bulk-EUSB/SDDR09"; us->transport = dpcm_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 1; } else { us->transport_name = "EUSB/SDDR09"; us->transport = sddr09_transport; us->transport_reset = usb_stor_CB_reset; us->max_lun = 0; } result = usb_stor_probe2(us); return result; } static struct usb_driver sddr09_driver = { .name = DRV_NAME, .probe = sddr09_probe, .disconnect = usb_stor_disconnect, .suspend = usb_stor_suspend, .resume = usb_stor_resume, .reset_resume = usb_stor_reset_resume, .pre_reset = usb_stor_pre_reset, .post_reset = usb_stor_post_reset, .id_table = sddr09_usb_ids, .soft_unbind = 1, .no_dynamic_id = 1, }; module_usb_stor_driver(sddr09_driver, sddr09_host_template, DRV_NAME);
37 8002 7997 43213 249 5493 8031 8007 42272 442 42254 7287 40630 2 4450 39360 1 39349 16807 16807 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *	                           INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *	int old = atomic_fetch_add_relaxed(r);
 *	// old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *	if (old < 0)
 *		atomic_set(r, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *                        0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire, for RCU/lockless data structures it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before, it also provides a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/refcount_types.h>
#include <linux/spinlock_types.h>

struct mutex;

/* Static initializer for a refcount_t holding the value n */
#define REFCOUNT_INIT(n)	{ .refs = ATOMIC_INIT(n), }
#define REFCOUNT_MAX		INT_MAX
#define REFCOUNT_SATURATED	(INT_MIN / 2)

/*
 * Reason codes passed to refcount_warn_saturate() by the inline helpers
 * below; each one identifies the check that tripped.
 */
enum refcount_saturation_type {
	/* __refcount_add_not_zero(): old value or old + i was negative */
	REFCOUNT_ADD_NOT_ZERO_OVF,
	/* __refcount_add(): old value or old + i was negative */
	REFCOUNT_ADD_OVF,
	/* __refcount_add(): the old value was 0 */
	REFCOUNT_ADD_UAF,
	/* __refcount_sub_and_test(): old value <= 0 or old - i negative */
	REFCOUNT_SUB_UAF,
	/* __refcount_dec(): the old value was <= 1 */
	REFCOUNT_DEC_LEAK,
};

void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - set a refcount's value
 * @r: the refcount
 * @n: value to which the refcount will be set
 */
static inline void refcount_set(refcount_t *r, int n)
{
	atomic_set(&r->refs, n);
}

/**
 * refcount_read - get a refcount's value
 * @r: the refcount
 *
 * Return: the refcount's value
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
	return atomic_read(&r->refs);
}

/*
 * Worker for refcount_add_not_zero() and __refcount_inc_not_zero().
 *
 * Adds @i with a relaxed cmpxchg loop that gives up as soon as the counter
 * is observed to be 0.  If @oldp is non-NULL the last observed old value is
 * stored there.  Returns the old value converted to bool, i.e. false only
 * when the counter was 0 and nothing was added.
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
	int old = refcount_read(r);

	do {
		if (!old)
			break;
	} while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i));

	if (oldp)
		*oldp = old;

	/* A negative old or new value means the counter left the valid range */
	if (unlikely(old < 0 || old + i < 0))
		refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

	return old;
}

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
	return __refcount_add_not_zero(i, r, NULL);
}

/*
 * Worker for refcount_add() and __refcount_inc().
 *
 * Unconditionally adds @i with a relaxed fetch-add, stores the old value in
 * *@oldp when @oldp is non-NULL, and then reports an old value of 0, or a
 * negative old/new value, through refcount_warn_saturate().
 */
static inline __signed_wrap
void __refcount_add(int i, refcount_t *r, int *oldp)
{
	int old = atomic_fetch_add_relaxed(i, &r->refs);

	if (oldp)
		*oldp = old;

	if (unlikely(!old))
		refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
	else if (unlikely(old < 0 || old + i < 0))
		refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_inc(), or one of its variants, should instead be used to
 * increment a reference count.
 */
static inline void refcount_add(int i, refcount_t *r)
{
	__refcount_add(i, r, NULL);
}

/* __refcount_add_not_zero() with an increment of 1 */
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
	return __refcount_add_not_zero(1, r, oldp);
}

/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount to increment
 *
 * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED
 * and WARN.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See the comment on top.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
	return __refcount_inc_not_zero(r, NULL);
}

/* __refcount_add() with an increment of 1 */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
	__refcount_add(1, r, oldp);
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN.
 *
 * Provides no memory ordering, it is assumed the caller already has a
 * reference on the object.
 *
 * Will WARN if the refcount is 0, as this represents a possible use-after-free
 * condition.
 */
static inline void refcount_inc(refcount_t *r)
{
	__refcount_inc(r, NULL);
}

/*
 * Worker for refcount_sub_and_test() and __refcount_dec_and_test().
 *
 * Subtracts @i with a release fetch-sub and stores the old value in *@oldp
 * when @oldp is non-NULL.  Returns true, after an acquire barrier, only
 * when the old value was positive and equal to @i.  Otherwise returns
 * false, first reporting through refcount_warn_saturate() if the old value
 * was <= 0 or old - i is negative.
 */
static inline __must_check __signed_wrap
bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
	int old = atomic_fetch_sub_release(i, &r->refs);

	if (oldp)
		*oldp = old;

	if (old > 0 && old == i) {
		smp_acquire__after_ctrl_dep();
		return true;
	}

	if (unlikely(old <= 0 || old - i < 0))
		refcount_warn_saturate(r, REFCOUNT_SUB_UAF);

	return false;
}

/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Similar to atomic_dec_and_test(), but it will WARN, return false and
 * ultimately leak on underflow and will fail to decrement when saturated
 * at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Use of this function is not recommended for the normal reference counting
 * use case in which references are taken and released one at a time.  In these
 * cases, refcount_dec(), or one of its variants, should instead be used to
 * decrement a reference count.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
	return __refcount_sub_and_test(i, r, NULL);
}

/* __refcount_sub_and_test() with a decrement of 1 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
	return __refcount_sub_and_test(1, r, oldp);
}

/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
 * decrement when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before, and provides an acquire ordering on success such that free()
 * must come after.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
	return __refcount_dec_and_test(r, NULL);
}

/*
 * Worker for refcount_dec().
 *
 * Subtracts 1 with a release fetch-sub, stores the old value in *@oldp when
 * @oldp is non-NULL, and reports an old value <= 1 through
 * refcount_warn_saturate().
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
	int old = atomic_fetch_sub_release(1, &r->refs);

	if (oldp)
		*oldp = old;

	if (unlikely(old <= 1))
		refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
 * when saturated at REFCOUNT_SATURATED.
 *
 * Provides release memory ordering, such that prior loads and stores are done
 * before.
 */
static inline void refcount_dec(refcount_t *r)
{
	__refcount_dec(r, NULL);
}

/* Out-of-line operations; their definitions are not part of this header. */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
						       spinlock_t *lock,
						       unsigned long *flags) __cond_acquires(lock);
#endif /* _LINUX_REFCOUNT_H */
2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2002-2005, Instant802 Networks, Inc.
 * Copyright 2005, Devicescape Software, Inc.
 * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
 * Copyright (C) 2022, 2024 Intel Corporation
 */

#ifndef IEEE80211_RATE_H
#define IEEE80211_RATE_H

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "sta_info.h"
#include "driver-ops.h"

/* Pairs a rate control ops table with the private pointer passed to its ops. */
struct rate_control_ref {
	const struct rate_control_ops *ops;
	void *priv;
};

void rate_control_get_rate(struct ieee80211_sub_if_data *sdata,
			   struct sta_info *sta,
			   struct ieee80211_tx_rate_control *txrc);

void rate_control_tx_status(struct ieee80211_local *local,
			    struct ieee80211_tx_status *st);

void rate_control_rate_init(struct link_sta_info *link_sta);
void rate_control_rate_init_all_links(struct sta_info *sta);
void rate_control_rate_update(struct ieee80211_local *local,
			      struct ieee80211_supported_band *sband,
			      struct link_sta_info *link_sta,
			      u32 changed);

/*
 * Initialise sta->rate_ctrl_lock, then return the result of the alloc_sta()
 * op of @ref, called with ref->priv, the embedded ieee80211_sta and @gfp.
 */
static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
					   struct sta_info *sta, gfp_t gfp)
{
	spin_lock_init(&sta->rate_ctrl_lock);
	return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp);
}

/*
 * Pass sta->rate_ctrl_priv to the free_sta() op of the rate control reference
 * stored in sta->rate_ctrl.
 *
 * NOTE(review): sta->rate_ctrl is dereferenced without a NULL check here, so
 * callers presumably guarantee it is set - confirm against call sites.
 */
static inline void rate_control_free_sta(struct sta_info *sta)
{
	struct rate_control_ref *ref = sta->rate_ctrl;
	struct ieee80211_sta *ista = &sta->sta;
	void *priv_sta = sta->rate_ctrl_priv;

	ref->ops->free_sta(ref->priv, ista, priv_sta);
}

/*
 * With CONFIG_MAC80211_DEBUGFS, call the optional add_sta_debugfs() op when
 * the station has both a rate control reference and a debugfs directory.
 * Without that option the body is empty.
 */
static inline void rate_control_add_sta_debugfs(struct sta_info *sta)
{
#ifdef CONFIG_MAC80211_DEBUGFS
	struct rate_control_ref *ref = sta->rate_ctrl;
	if (ref && sta->debugfs_dir && ref->ops->add_sta_debugfs)
		ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv,
					  sta->debugfs_dir);
#endif
}

extern const struct debugfs_short_fops rcname_ops;

/*
 * With CONFIG_MAC80211_DEBUGFS, and only when @local has a rate control
 * reference whose ops provide add_debugfs(): create the rc directory under
 * the wiphy debugfs directory, remember it in local->debugfs.rcdir, add a
 * name file backed by rcname_ops, then call the add_debugfs() op.
 * Without that option the body is empty.
 */
static inline void rate_control_add_debugfs(struct ieee80211_local *local)
{
#ifdef CONFIG_MAC80211_DEBUGFS
	struct dentry *debugfsdir;

	if (!local->rate_ctrl)
		return;

	if (!local->rate_ctrl->ops->add_debugfs)
		return;

	debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir);
	local->debugfs.rcdir = debugfsdir;
	debugfs_create_file("name", 0400, debugfsdir,
			    local->rate_ctrl, &rcname_ops);

	local->rate_ctrl->ops->add_debugfs(&local->hw, local->rate_ctrl->priv,
					   debugfsdir);
#endif
}

void ieee80211_check_rate_mask(struct ieee80211_link_data *link);

/* Get a reference to the rate control algorithm. If `name' is NULL, get the
 * first available algorithm. */
int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local,
				 const char *name);
void rate_control_deinitialize(struct ieee80211_local *local);


/* Rate control algorithms */

#ifdef CONFIG_MAC80211_RC_MINSTREL
int rc80211_minstrel_init(void);
void rc80211_minstrel_exit(void);
#else
/* Stubs used when minstrel is not configured: init reports success (0). */
static inline int rc80211_minstrel_init(void)
{
	return 0;
}
static inline void rc80211_minstrel_exit(void)
{
}
#endif


#endif /* IEEE80211_RATE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_XATTR_H
#define _BCACHEFS_XATTR_H

#include "str_hash.h"

extern const struct bch_hash_desc bch2_xattr_hash_desc;

int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c,
			struct bkey_validate_context);
void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

/* bkey_ops initialiser for xattr keys: validate and to_text hooks, min value size 8. */
#define bch2_bkey_ops_xattr ((struct bkey_ops) {	\
	.key_validate	= bch2_xattr_validate,		\
	.val_to_text	= bch2_xattr_to_text,		\
	.min_val_size	= 8,				\
})

/*
 * Number of u64 words needed for the part of struct bch_xattr that precedes
 * x_name plus @name_len and @val_len bytes, rounded up to a whole word.
 */
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
	return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
			    name_len + val_len, sizeof(u64));
}

/*
 * Address x_name_len bytes past the start of x_name, i.e. the first byte after
 * the name (presumably where the value is stored - confirm against the
 * struct bch_xattr definition).
 */
#define xattr_val(_xattr)					\
	((void *) (_xattr)->x_name + (_xattr)->x_name_len)

/* Lookup key built by X_SEARCH(): a type byte plus a qstr holding the name. */
struct xattr_search_key {
	u8		type;
	struct qstr	name;
};

/* Build a struct xattr_search_key compound literal from a type, name and length. */
#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key)	\
	{ .type = _type, .name = QSTR_INIT(_name, _len) })

struct dentry;
struct xattr_handler;
struct bch_hash_info;
struct bch_inode_info;

/* Exported for cmd_migrate.c in tools: */
int bch2_xattr_set(struct btree_trans *, subvol_inum,
		   struct bch_inode_unpacked *, const struct bch_hash_info *,
		   const char *, const void *, size_t, int, int);

ssize_t bch2_xattr_list(struct dentry *, char *, size_t);

extern const struct xattr_handler * const bch2_xattr_handlers[];

#endif /* _BCACHEFS_XATTR_H */
3 3 3 1 3 3 1442 1443 5 5 5 1 5 5 19 93 1914 1915 5 1912 1140 1915 1327 1361 204 205 204 119 118 93 114 3 13 62 524 209 4659 3722 1562 1434 627 1433 1442 187 187 139 1560 1524 9 9 109 1543 97 96 97 97 1135 3552 3438 400 531 531 334 528 507 257 3 525 3 524 528 12 64 1 66 3 207 143 76 34 68 9 67 68 2 68 68 68 67 67 76 39 217 234 182 60 198 199 34 76 26 67 4 28 43 43 71 54 8 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 
433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 // SPDX-License-Identifier: GPL-2.0-only /* * mm/truncate.c - code for taking down pages from address_spaces * * Copyright (C) 2002, Linus Torvalds * * 10Sep2002 Andrew Morton * Initial version. 
*/

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"

/*
 * Walk mapping->i_pages from index @start to @max (both taken as xarray
 * indices) and store NULL over every value entry found. Nothing is done for
 * shmem or DAX mappings. The walk runs with the host inode i_lock held and the
 * xarray lock taken with interrupts disabled; once the xarray lock is dropped
 * the inode is added to the LRU if mapping_shrinkable() says so.
 */
static void clear_shadow_entries(struct address_space *mapping,
				 unsigned long start, unsigned long max)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct folio *folio;

	/* Handled by shmem itself, or for DAX we do nothing. */
	if (shmem_mapping(mapping) || dax_mapping(mapping))
		return;

	xas_set_update(&xas, workingset_update_node);

	/* i_lock is taken outside the xarray lock and released after it. */
	spin_lock(&mapping->host->i_lock);
	xas_lock_irq(&xas);

	/* Clear all shadow entries from start to max */
	xas_for_each(&xas, folio, max) {
		if (xa_is_value(folio))
			xas_store(&xas, NULL);
	}

	xas_unlock_irq(&xas);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);
}

/*
 * Unconditionally remove exceptional entries. Usually called from truncate
 * path. Note that the folio_batch may be altered by this function by removing
 * exceptional entries similar to what folio_batch_remove_exceptionals() does.
 * Please note that indices[] has entries in ascending order as guaranteed by
 * either find_get_entries() or find_lock_entries().
*/
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
				struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, indices[0]);
	int nr = folio_batch_count(fbatch);
	struct folio *folio;
	int i, j;

	/* Handled by shmem itself */
	if (shmem_mapping(mapping))
		return;

	/* Find the first value entry in the batch; j == nr means there is none. */
	for (j = 0; j < nr; j++)
		if (xa_is_value(fbatch->folios[j]))
			break;

	if (j == nr)
		return;

	/* DAX: delete each value entry individually, no xarray walk here. */
	if (dax_mapping(mapping)) {
		for (i = j; i < nr; i++) {
			if (xa_is_value(fbatch->folios[i]))
				dax_delete_mapping_entry(mapping, indices[i]);
		}
		goto out;
	}

	/* Start the walk at the index of the first value entry in the batch. */
	xas_set(&xas, indices[j]);
	xas_set_update(&xas, workingset_update_node);

	spin_lock(&mapping->host->i_lock);
	xas_lock_irq(&xas);

	/* Store NULL over every value entry up to the last index of the batch. */
	xas_for_each(&xas, folio, indices[nr-1]) {
		if (xa_is_value(folio))
			xas_store(&xas, NULL);
	}

	xas_unlock_irq(&xas);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);
out:
	folio_batch_remove_exceptionals(fbatch);
}

/**
 * folio_invalidate - Invalidate part or all of a folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * folio_invalidate() is called when all or part of the folio has become
 * invalidated by a truncate operation.
 *
 * folio_invalidate() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point. Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 *
 * The work is delegated to the ->invalidate_folio() operation of the mapping
 * that owns @folio; nothing is done when that operation is not provided.
 */
void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
	const struct address_space_operations *aops = folio->mapping->a_ops;

	if (aops->invalidate_folio)
		aops->invalidate_folio(folio, offset, length);
}
EXPORT_SYMBOL_GPL(folio_invalidate);

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned. It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping. This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
static void truncate_cleanup_folio(struct folio *folio)
{
	if (folio_mapped(folio))
		unmap_mapping_folio(folio);

	if (folio_needs_release(folio))
		folio_invalidate(folio, 0, folio_size(folio));

	/*
	 * Some filesystems seem to re-dirty the page even after
	 * the VM has canceled the dirty bit (eg ext3 journaling).
	 * Hence dirty accounting check is placed after invalidation.
	 */
	folio_cancel_dirty(folio);
}

/*
 * Remove @folio from @mapping. Returns -EIO without touching the folio when
 * folio->mapping is no longer @mapping; otherwise runs
 * truncate_cleanup_folio() followed by filemap_remove_folio() and returns 0.
 */
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
	if (folio->mapping != mapping)
		return -EIO;

	truncate_cleanup_folio(folio);
	filemap_remove_folio(folio);
	return 0;
}

/*
 * Handle partial folios. The folio may be entirely within the
 * range if a split has raced with us. If not, we zero the part of the
 * folio that's within the [start, end] range, and then split the folio if
 * it's large. split_page_range() will discard pages which now lie beyond
 * i_size, and we rely on the caller to discard pages which lie within a
 * newly created hole.
 *
 * Returns false if splitting failed so the caller can avoid
 * discarding the entire folio which is stubbornly unsplit.
*/
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
	loff_t pos = folio_pos(folio);
	unsigned int offset, length;

	/* offset: where the affected range begins, in bytes from the folio start. */
	if (pos < start)
		offset = start - pos;
	else
		offset = 0;
	/* length: how many bytes of the folio lie inside [start, end]. */
	length = folio_size(folio);
	if (pos + length <= (u64)end)
		length = length - offset;
	else
		length = end + 1 - pos - offset;

	folio_wait_writeback(folio);
	/* The range covers the whole folio: remove it instead of zeroing. */
	if (length == folio_size(folio)) {
		truncate_inode_folio(folio->mapping, folio);
		return true;
	}

	/*
	 * We may be zeroing pages we're about to discard, but it avoids
	 * doing a complex calculation here, and then doing the zeroing
	 * anyway if the page split fails.
	 */
	if (!mapping_inaccessible(folio->mapping))
		folio_zero_range(folio, offset, length);

	if (folio_needs_release(folio))
		folio_invalidate(folio, offset, length);
	/* A folio that is not large needs no split; zeroing was enough. */
	if (!folio_test_large(folio))
		return true;
	if (split_folio(folio) == 0)
		return true;
	/* Split failed: report failure for a dirty folio, drop a clean one whole. */
	if (folio_test_dirty(folio))
		return false;
	truncate_inode_folio(folio->mapping, folio);
	return true;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 *
 * Returns -EINVAL when @mapping is NULL, -EIO when the host inode is not a
 * regular file, and otherwise the result of truncate_inode_folio().
 */
int generic_error_remove_folio(struct address_space *mapping,
		struct folio *folio)
{
	if (!mapping)
		return -EINVAL;
	/*
	 * Only punch for normal data pages for now.
	 * Handling other types like directories would need more auditing.
	 */
	if (!S_ISREG(mapping->host->i_mode))
		return -EIO;
	return truncate_inode_folio(mapping, folio);
}
EXPORT_SYMBOL(generic_error_remove_folio);

/**
 * mapping_evict_folio() - Remove an unused folio from the page-cache.
 * @mapping: The mapping this folio belongs to.
 * @folio: The folio to remove.
 *
 * Safely remove one folio from the page cache.
 * It only drops clean, unused folios.
 *
 * Context: Folio must be locked.
 * Return: The number of pages successfully removed.
*/
long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
{
	/* The page may have been truncated before it was locked */
	if (!mapping)
		return 0;
	if (folio_test_dirty(folio) || folio_test_writeback(folio))
		return 0;
	/* The refcount will be elevated if any page in the folio is mapped */
	if (folio_ref_count(folio) >
			folio_nr_pages(folio) + folio_has_private(folio) + 1)
		return 0;
	if (!filemap_release_folio(folio, 0))
		return 0;

	return remove_mapping(mapping, folio);
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking. It will not
 * block on page locks and it will not block on writeback. The second pass
 * will wait. This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code. Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidate_folio() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 */
void truncate_inode_pages_range(struct address_space *mapping,
				loff_t lstart, loff_t lend)
{
	pgoff_t		start;		/* inclusive */
	pgoff_t		end;		/* exclusive */
	struct folio_batch fbatch;
	pgoff_t		indices[PAGEVEC_SIZE];
	pgoff_t		index;
	int		i;
	struct folio	*folio;
	bool		same_folio;

	if (mapping_empty(mapping))
		return;

	/*
	 * 'start' and 'end' always covers the range of pages to be fully
	 * truncated. Partial pages at either edge of the range are handled
	 * separately below by truncate_inode_partial_folio().
	 * Note that 'end' is exclusive while 'lend' is inclusive.
	 */
	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (lend == -1)
		/*
		 * lend == -1 indicates end-of-file so we have to set 'end'
		 * to the highest possible pgoff_t and since the type is
		 * unsigned we're using -1.
		 */
		end = -1;
	else
		end = (lend + 1) >> PAGE_SHIFT;

	/*
	 * Pass one (see the function comment): process the entries that
	 * find_lock_entries() returns for [start, end - 1], batch by batch.
	 */
	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, &index, end - 1,
			&fbatch, indices)) {
		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			truncate_cleanup_folio(fbatch.folios[i]);
		delete_from_page_cache_batch(mapping, &fbatch);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_unlock(fbatch.folios[i]);
		folio_batch_release(&fbatch);
		cond_resched();
	}

	/*
	 * Partial folio containing lstart. If it could not be dealt with,
	 * shrink the fully-truncated range so the final loop skips it.
	 */
	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
	folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
	if (!IS_ERR(folio)) {
		same_folio = lend < folio_pos(folio) + folio_size(folio);
		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
			start = folio_next_index(folio);
			if (same_folio)
				end = folio->index;
		}
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
	}

	/* Partial folio containing lend, unless it is the one handled above. */
	if (!same_folio) {
		folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
						FGP_LOCK, 0);
		if (!IS_ERR(folio)) {
			if (!truncate_inode_partial_folio(folio, lstart, lend))
				end = folio->index;
			folio_unlock(folio);
			folio_put(folio);
		}
	}

	/*
	 * Pass two: lock each remaining folio, wait for writeback and remove
	 * it, rescanning from start until a scan from start finds nothing.
	 */
	index = start;
	while (index < end) {
		cond_resched();
		if (!find_get_entries(mapping, &index, end - 1, &fbatch,
				indices)) {
			/* If all gone from start onwards, we're done */
			if (index == start)
				break;
			/* Otherwise restart to make sure all gone */
			index = start;
			continue;
		}

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];

			/* We rely upon deletion not changing page->index */

			if (xa_is_value(folio))
				continue;

			folio_lock(folio);
			VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
			folio_wait_writeback(folio);
			truncate_inode_folio(mapping, folio);
			folio_unlock(folio);
		}
		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
		folio_batch_release(&fbatch);
	}
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_rwsem and
 * mapping->invalidate_lock.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __filemap_remove_folio()) in the specified range. Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	/* An end offset of -1 selects everything from lstart onwards. */
	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_rwsem.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
	/*
	 * Page reclaim can not participate in regular inode lifetime
	 * management (can't call iput()) and thus can race with the
	 * inode teardown. Tell it when the address space is exiting,
	 * so that it does not install eviction information after the
	 * final truncate has begun.
	 */
	mapping_set_exiting(mapping);

	if (!mapping_empty(mapping)) {
		/*
		 * As truncation uses a lockless tree lookup, cycle
		 * the tree lock to make sure any ongoing tree
		 * modification that does not see AS_EXITING is
		 * completed before starting the final truncate.
		 */
		xa_lock_irq(&mapping->i_pages);
		xa_unlock_irq(&mapping->i_pages);
	}

	truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/**
 * mapping_try_invalidate - Invalidate all the evictable folios of one inode
 * @mapping: the address_space which holds the folios to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 * @nr_failed: How many folio invalidations failed
 *
 * This function is similar to invalidate_mapping_pages(), except that it
 * returns the number of folios which could not be evicted in @nr_failed.
 * @nr_failed may be NULL, in which case failures are not counted.
 *
 * Return: the number of value entries encountered plus the sum of the
 * mapping_evict_folio() results for the folios in the range.
 */
unsigned long mapping_try_invalidate(struct address_space *mapping,
		pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
	pgoff_t indices[PAGEVEC_SIZE];
	struct folio_batch fbatch;
	pgoff_t index = start;
	unsigned long ret;
	unsigned long count = 0;
	int i;

	folio_batch_init(&fbatch);
	while (find_lock_entries(mapping, &index, end, &fbatch, indices)) {
		bool xa_has_values = false;
		int nr = folio_batch_count(&fbatch);

		for (i = 0; i < nr; i++) {
			struct folio *folio = fbatch.folios[i];

			/* We rely upon deletion not changing folio->index */

			if (xa_is_value(folio)) {
				xa_has_values = true;
				count++;
				continue;
			}

			ret = mapping_evict_folio(mapping, folio);
			folio_unlock(folio);
			/*
			 * Invalidation is a hint that the folio is no longer
			 * of interest and try to speed up its reclaim.
			 */
			if (!ret) {
				deactivate_file_folio(folio);
				/* Likely in the lru cache of a remote CPU */
				if (nr_failed)
					(*nr_failed)++;
			}
			count += ret;
		}

		/* Value entries in this batch are cleared in a single walk. */
		if (xa_has_values)
			clear_shadow_entries(mapping, indices[0], indices[nr-1]);

		folio_batch_remove_exceptionals(&fbatch);
		folio_batch_release(&fbatch);
		cond_resched();
	}
	return count;
}

/**
 * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
 * @mapping: the address_space which holds the cache to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 *
 * This function removes pages that are clean, unmapped and unlocked,
 * as well as shadow entries. It will not block on IO activity.
 *
 * If you want to remove all the pages of one inode, regardless of
 * their use and writeback state, use truncate_inode_pages().
 *
 * Return: The number of indices that had their contents invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	return mapping_try_invalidate(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * This is like mapping_evict_folio(), except it ignores the folio's
 * refcount. We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave folios behind because
 * shrink_folio_list() has a temp ref on them, or because they're transiently
 * sitting in the folio_add_lru() caches.
*/
static int invalidate_complete_folio2(struct address_space *mapping,
					struct folio *folio)
{
	if (folio->mapping != mapping)
		return 0;

	if (!filemap_release_folio(folio, GFP_KERNEL))
		return 0;

	/* The dirty test and the removal both happen under i_lock + xarray lock. */
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	if (folio_test_dirty(folio))
		goto failed;

	BUG_ON(folio_has_private(folio));
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
	/* 1 means the folio was removed; every other exit returns 0. */
	return 1;
failed:
	xa_unlock_irq(&mapping->i_pages);
	spin_unlock(&mapping->host->i_lock);
	return 0;
}

/*
 * Call the ->launder_folio() operation of @mapping on a dirty @folio and
 * return its result. Returns 0 without calling it when the folio is clean,
 * when it no longer belongs to @mapping, or when the operation is not
 * provided.
 */
static int folio_launder(struct address_space *mapping, struct folio *folio)
{
	if (!folio_test_dirty(folio))
		return 0;
	if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
		return 0;
	return mapping->a_ops->launder_folio(folio);
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
*/ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; if (mapping_empty(mapping)) return 0; folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; if (dax_mapping(mapping) && !dax_invalidate_mapping_entry_sync(mapping, indices[i])) ret = -EBUSY; continue; } if (!did_range_unmap && folio_mapped(folio)) { /* * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, indices[i], (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); ret2 = folio_launder(mapping, folio); if (ret2 == 0) { if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; folio_unlock(folio); } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]);