Total coverage: 362465 (20%)of 1851499
65 65 15 15 3 1 7 7 1 8 8 8 8 1 6 2 7 9 9 9 9 3 3 3 3 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/module.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfctrl.h> #include <net/caif/cfmuxl.h> #include <net/caif/cffrml.h> #include <net/caif/cfserl.h> #include <net/caif/cfsrvl.h> #include <net/caif/caif_dev.h> #define container_obj(layr) container_of(layr, struct cfcnfg, layer) /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces */ struct cfcnfg_phyinfo { struct list_head node; bool up; /* Pointer to the layer below the MUX (framing layer) */ struct cflayer *frm_layer; /* Pointer to the lowest actual physical layer */ struct cflayer *phy_layer; /* Unique identifier of the physical interface */ unsigned int id; /* Preference of the physical in interface */ enum cfcnfg_phy_preference pref; /* Information about the physical device */ struct dev_info dev_info; /* Interface index */ int ifindex; /* Protocol head room added for CAIF link layer */ int head_room; /* Use Start of frame checksum */ bool use_fcs; }; struct cfcnfg { struct cflayer layer; struct cflayer *ctrl; struct cflayer *mux; struct list_head phys; struct mutex lock; }; static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer); static void cfctrl_resp_func(void); static void cfctrl_enum_resp(void); struct cfcnfg *cfcnfg_create(void) { struct cfcnfg *this; struct cfctrl_rsp *resp; might_sleep(); /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) return NULL; this->mux = cfmuxl_create(); if (!this->mux) goto out_of_mem; this->ctrl = cfctrl_create(); if (!this->ctrl) goto out_of_mem; /* Initiate response functions */ resp = cfctrl_get_respfuncs(this->ctrl); resp->enum_rsp = cfctrl_enum_resp; resp->linkerror_ind = cfctrl_resp_func; resp->linkdestroy_rsp = cfcnfg_linkdestroy_rsp; resp->sleep_rsp = cfctrl_resp_func; resp->wake_rsp = cfctrl_resp_func; resp->restart_rsp = cfctrl_resp_func; resp->radioset_rsp = cfctrl_resp_func; resp->linksetup_rsp = cfcnfg_linkup_rsp; resp->reject_rsp = cfcnfg_reject_rsp; INIT_LIST_HEAD(&this->phys); cfmuxl_set_uplayer(this->mux, this->ctrl, 0); layer_set_dn(this->ctrl, this->mux); layer_set_up(this->ctrl, this); mutex_init(&this->lock); return this; out_of_mem: synchronize_rcu(); kfree(this->mux); kfree(this->ctrl); kfree(this); return NULL; } void cfcnfg_remove(struct cfcnfg *cfg) { might_sleep(); if (cfg) { synchronize_rcu(); kfree(cfg->mux); cfctrl_remove(cfg->ctrl); kfree(cfg); } } static void cfctrl_resp_func(void) { } static struct cfcnfg_phyinfo *cfcnfg_get_phyinfo_rcu(struct cfcnfg *cnfg, u8 phyid) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->id == phyid) return phy; return NULL; } static void cfctrl_enum_resp(void) { } static struct dev_info *cfcnfg_get_phyid(struct cfcnfg *cnfg, enum cfcnfg_phy_preference phy_pref) { /* Try to match with specified preference */ struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) { if (phy->up && phy->pref == phy_pref && phy->frm_layer != NULL) return &phy->dev_info; } /* Otherwise just return something */ list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->up) return &phy->dev_info; return NULL; } static int cfcnfg_get_id_from_ifi(struct cfcnfg *cnfg, int ifi) { struct cfcnfg_phyinfo *phy; list_for_each_entry_rcu(phy, &cnfg->phys, node) if (phy->ifindex == ifi && phy->up) return phy->id; return -ENODEV; } int caif_disconnect_client(struct net *net, struct cflayer *adap_layer) { u8 channel_id; struct cfcnfg *cfg = get_cfcnfg(net); caif_assert(adap_layer != NULL); cfctrl_cancel_req(cfg->ctrl, adap_layer); channel_id = adap_layer->id; if (channel_id != 0) { struct cflayer *servl; servl = cfmuxl_remove_uplayer(cfg->mux, channel_id); cfctrl_linkdown_req(cfg->ctrl, channel_id, adap_layer); if (servl != NULL) layer_set_up(servl, NULL); } else pr_debug("nothing to disconnect\n"); /* Do RCU sync before initiating cleanup */ synchronize_rcu(); if (adap_layer->ctrlcmd != NULL) adap_layer->ctrlcmd(adap_layer, CAIF_CTRLCMD_DEINIT_RSP, 0); return 0; } EXPORT_SYMBOL(caif_disconnect_client); static void cfcnfg_linkdestroy_rsp(struct cflayer *layer, u8 channel_id) { } static const int protohead[CFCTRL_SRV_MASK] = { [CFCTRL_SRV_VEI] = 4, [CFCTRL_SRV_DATAGRAM] = 7, [CFCTRL_SRV_UTIL] = 4, [CFCTRL_SRV_RFM] = 3, [CFCTRL_SRV_DBG] = 3, }; static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, struct caif_connect_request *s, struct cfctrl_link_param *l) { struct dev_info *dev_info; enum cfcnfg_phy_preference pref; int res; memset(l, 0, sizeof(*l)); /* In caif protocol low value is high priority */ l->priority = CAIF_PRIO_MAX - s->priority + 1; if (s->ifindex != 0) { res = cfcnfg_get_id_from_ifi(cnfg, s->ifindex); if (res < 0) return res; l->phyid = res; } else { switch (s->link_selector) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: return -EINVAL; } dev_info = cfcnfg_get_phyid(cnfg, pref); if (dev_info == NULL) return -ENODEV; l->phyid = dev_info->id; } switch (s->protocol) { case CAIFPROTO_AT: l->linktype = CFCTRL_SRV_VEI; l->endpoint = (s->sockaddr.u.at.type >> 2) & 0x3; l->chtype = s->sockaddr.u.at.type & 0x3; break; case CAIFPROTO_DATAGRAM: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_DATAGRAM_LOOP: l->linktype = CFCTRL_SRV_DATAGRAM; l->chtype = 0x03; l->endpoint = 0x00; l->u.datagram.connid = s->sockaddr.u.dgm.connection_id; break; case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; strscpy(l->u.utility.name, s->sockaddr.u.util.service, sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) l->u.utility.paramlen = sizeof(l->u.utility.params); memcpy(l->u.utility.params, s->param.data, l->u.utility.paramlen); break; case CAIFPROTO_DEBUG: l->linktype = CFCTRL_SRV_DBG; l->endpoint = s->sockaddr.u.dbg.service; l->chtype = s->sockaddr.u.dbg.type; break; default: return -EINVAL; } return 0; } int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, struct cflayer *adap_layer, int *ifindex, int *proto_head, int *proto_tail) { struct cflayer *frml; struct cfcnfg_phyinfo *phy; int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, &param); if (err) goto unlock; phy = cfcnfg_get_phyinfo_rcu(cfg, param.phyid); if (!phy) { err = -ENODEV; goto unlock; } err = -EINVAL; if (adap_layer == NULL) { pr_err("adap_layer is zero\n"); goto unlock; } if (adap_layer->receive == NULL) { pr_err("adap_layer->receive is NULL\n"); goto unlock; } if (adap_layer->ctrlcmd == NULL) { pr_err("adap_layer->ctrlcmd == NULL\n"); goto unlock; } err = -ENODEV; frml = phy->frm_layer; if (frml == NULL) { pr_err("Specified PHY type does not exist!\n"); goto unlock; } caif_assert(param.phyid == phy->id); caif_assert(phy->frm_layer->id == param.phyid); caif_assert(phy->phy_layer->id == param.phyid); *ifindex = phy->ifindex; *proto_tail = 2; *proto_head = protohead[param.linktype] + phy->head_room; rcu_read_unlock(); /* FIXME: ENUMERATE INITIALLY WHEN ACTIVATING PHYSICAL INTERFACE */ cfctrl_enum_req(cfg->ctrl, param.phyid); return cfctrl_linkup_request(cfg->ctrl, &param, adap_layer); unlock: rcu_read_unlock(); return err; } EXPORT_SYMBOL(caif_connect_client); static void cfcnfg_reject_rsp(struct cflayer *layer, u8 channel_id, struct cflayer *adapt_layer) { if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); } static void cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, u8 phyid, struct cflayer *adapt_layer) { struct cfcnfg *cnfg = container_obj(layer); struct cflayer *servicel = NULL; struct cfcnfg_phyinfo *phyinfo; struct net_device *netdev; if (channel_id == 0) { pr_warn("received channel_id zero\n"); if (adapt_layer != NULL && adapt_layer->ctrlcmd != NULL) adapt_layer->ctrlcmd(adapt_layer, CAIF_CTRLCMD_INIT_FAIL_RSP, 0); return; } rcu_read_lock(); if (adapt_layer == NULL) { pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); goto unlock; } caif_assert(cnfg != NULL); caif_assert(phyid != 0); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { pr_err("ERROR: Link Layer Device disappeared while connecting\n"); goto unlock; } caif_assert(phyinfo != NULL); caif_assert(phyinfo->id == phyid); caif_assert(phyinfo->phy_layer != NULL); caif_assert(phyinfo->phy_layer->id == phyid); adapt_layer->id = channel_id; switch (serv) { case CFCTRL_SRV_VEI: servicel = cfvei_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DATAGRAM: servicel = cfdgml_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_RFM: netdev = phyinfo->dev_info.dev; servicel = cfrfml_create(channel_id, &phyinfo->dev_info, netdev->mtu); break; case CFCTRL_SRV_UTIL: servicel = cfutill_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_VIDEO: servicel = cfvidl_create(channel_id, &phyinfo->dev_info); break; case CFCTRL_SRV_DBG: servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: pr_err("Protocol error. Link setup response - unknown channel type\n"); goto unlock; } if (!servicel) goto unlock; layer_set_dn(servicel, cnfg->mux); cfmuxl_set_uplayer(cnfg->mux, servicel, channel_id); layer_set_up(servicel, adapt_layer); layer_set_dn(adapt_layer, servicel); rcu_read_unlock(); servicel->ctrlcmd(servicel, CAIF_CTRLCMD_INIT_RSP, 0); return; unlock: rcu_read_unlock(); } int cfcnfg_add_phy_layer(struct cfcnfg *cnfg, struct net_device *dev, struct cflayer *phy_layer, enum cfcnfg_phy_preference pref, struct cflayer *link_support, bool fcs, int head_room) { struct cflayer *frml; struct cfcnfg_phyinfo *phyinfo = NULL; int i, res = 0; u8 phyid; mutex_lock(&cnfg->lock); /* CAIF protocol allow maximum 6 link-layers */ for (i = 0; i < 7; i++) { phyid = (dev->ifindex + i) & 0x7; if (phyid == 0) continue; if (cfcnfg_get_phyinfo_rcu(cnfg, phyid) == NULL) goto got_phyid; } pr_warn("Too many CAIF Link Layers (max 6)\n"); res = -EEXIST; goto out; got_phyid: phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC); if (!phyinfo) { res = -ENOMEM; goto out; } phy_layer->id = phyid; phyinfo->pref = pref; phyinfo->id = phyid; phyinfo->dev_info.id = phyid; phyinfo->dev_info.dev = dev; phyinfo->phy_layer = phy_layer; phyinfo->ifindex = dev->ifindex; phyinfo->head_room = head_room; phyinfo->use_fcs = fcs; frml = cffrml_create(phyid, fcs); if (!frml) { res = -ENOMEM; goto out_err; } phyinfo->frm_layer = frml; layer_set_up(frml, cnfg->mux); if (link_support != NULL) { link_support->id = phyid; layer_set_dn(frml, link_support); layer_set_up(link_support, frml); layer_set_dn(link_support, phy_layer); layer_set_up(phy_layer, link_support); } else { layer_set_dn(frml, phy_layer); layer_set_up(phy_layer, frml); } list_add_rcu(&phyinfo->node, &cnfg->phys); out: mutex_unlock(&cnfg->lock); return res; out_err: kfree(phyinfo); mutex_unlock(&cnfg->lock); return res; } EXPORT_SYMBOL(cfcnfg_add_phy_layer); int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, bool up) { struct cfcnfg_phyinfo *phyinfo; rcu_read_lock(); phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phy_layer->id); if (phyinfo == NULL) { rcu_read_unlock(); return -ENODEV; } if (phyinfo->up == up) { rcu_read_unlock(); return 0; } phyinfo->up = up; if (up) { cffrml_hold(phyinfo->frm_layer); cfmuxl_set_dnlayer(cnfg->mux, phyinfo->frm_layer, phy_layer->id); } else { cfmuxl_remove_dnlayer(cnfg->mux, phy_layer->id); cffrml_put(phyinfo->frm_layer); } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(cfcnfg_set_phy_state); int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer) { struct cflayer *frml, *frml_dn; u16 phyid; struct cfcnfg_phyinfo *phyinfo; might_sleep(); mutex_lock(&cnfg->lock); phyid = phy_layer->id; phyinfo = cfcnfg_get_phyinfo_rcu(cnfg, phyid); if (phyinfo == NULL) { mutex_unlock(&cnfg->lock); return 0; } caif_assert(phyid == phyinfo->id); caif_assert(phy_layer == phyinfo->phy_layer); caif_assert(phy_layer->id == phyid); caif_assert(phyinfo->frm_layer->id == phyid); list_del_rcu(&phyinfo->node); synchronize_rcu(); /* Fail if reference count is not zero */ if (cffrml_refcnt_read(phyinfo->frm_layer) != 0) { pr_info("Wait for device inuse\n"); list_add_rcu(&phyinfo->node, &cnfg->phys); mutex_unlock(&cnfg->lock); return -EAGAIN; } frml = phyinfo->frm_layer; frml_dn = frml->dn; cffrml_set_uplayer(frml, NULL); cffrml_set_dnlayer(frml, NULL); if (phy_layer != frml_dn) { layer_set_up(frml_dn, NULL); layer_set_dn(frml_dn, NULL); } layer_set_up(phy_layer, NULL); if (phyinfo->phy_layer != frml_dn) kfree(frml_dn); cffrml_free(frml); kfree(phyinfo); mutex_unlock(&cnfg->lock); return 0; } EXPORT_SYMBOL(cfcnfg_del_phy_layer);
149 149 18 18 18 159 31 50 24 31 50 24 31 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 // SPDX-License-Identifier: GPL-2.0 #include <linux/tty.h> #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/semaphore.h> #include <linux/sched.h> #include "tty.h" /* Legacy tty mutex glue */ /* * Getting the big tty mutex. */ void tty_lock(struct tty_struct *tty) { tty_kref_get(tty); mutex_lock(&tty->legacy_mutex); } EXPORT_SYMBOL(tty_lock); int tty_lock_interruptible(struct tty_struct *tty) { int ret; tty_kref_get(tty); ret = mutex_lock_interruptible(&tty->legacy_mutex); if (ret) tty_kref_put(tty); return ret; } void tty_unlock(struct tty_struct *tty) { mutex_unlock(&tty->legacy_mutex); tty_kref_put(tty); } EXPORT_SYMBOL(tty_unlock); void tty_lock_slave(struct tty_struct *tty) { if (tty && tty != tty->link) tty_lock(tty); } void tty_unlock_slave(struct tty_struct *tty) { if (tty && tty != tty->link) tty_unlock(tty); } void tty_set_lock_subclass(struct tty_struct *tty) { lockdep_set_subclass(&tty->legacy_mutex, TTY_LOCK_SLAVE); }
65 65 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/sunrpc_syms.c * * Symbols exported by the sunrpc module. * * Copyright (C) 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/types.h> #include <linux/uio.h> #include <linux/unistd.h> #include <linux/init.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/auth.h> #include <linux/workqueue.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/xprtsock.h> #include "sunrpc.h" #include "sysfs.h" #include "netns.h" unsigned int sunrpc_net_id; EXPORT_SYMBOL_GPL(sunrpc_net_id); static __net_init int sunrpc_init_net(struct net *net) { int err; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); err = rpc_proc_init(net); if (err) goto err_proc; err = ip_map_cache_create(net); if (err) goto err_ipmap; err = unix_gid_cache_create(net); if (err) goto err_unixgid; err = rpc_pipefs_init_net(net); if (err) goto err_pipefs; INIT_LIST_HEAD(&sn->all_clients); spin_lock_init(&sn->rpc_client_lock); spin_lock_init(&sn->rpcb_clnt_lock); return 0; err_pipefs: unix_gid_cache_destroy(net); err_unixgid: ip_map_cache_destroy(net); err_ipmap: rpc_proc_exit(net); err_proc: return err; } static __net_exit void sunrpc_exit_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); WARN_ON_ONCE(!list_empty(&sn->all_clients)); } static struct pernet_operations sunrpc_net_ops = { .init = sunrpc_init_net, .exit = sunrpc_exit_net, .id = &sunrpc_net_id, .size = sizeof(struct sunrpc_net), }; static int __init init_sunrpc(void) { int err = rpc_init_mempool(); if (err) goto out; err = rpcauth_init_module(); if (err) goto out2; cache_initialize(); err = register_pernet_subsys(&sunrpc_net_ops); if (err) goto out3; err = register_rpc_pipefs(); if (err) goto out4; err = rpc_sysfs_init(); if (err) goto out5; sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ return 0; out5: unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: rpcauth_remove_module(); out2: rpc_destroy_mempool(); out: return err; } static void __exit cleanup_sunrpc(void) { rpc_sysfs_exit(); rpc_cleanup_clids(); xprt_cleanup_ids(); xprt_multipath_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); sunrpc_debugfs_exit(); unregister_rpc_pipefs(); rpc_destroy_mempool(); unregister_pernet_subsys(&sunrpc_net_ops); auth_domain_cleanup(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_unregister_sysctl(); #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("Sun RPC core"); MODULE_LICENSE("GPL"); fs_initcall(init_sunrpc); /* Ensure we're initialised before nfs */ module_exit(cleanup_sunrpc);
2 1 2 3 106 8 39 58 42 18 3 21 1 65 1 309 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * journal.h * * Defines journalling api and structures. * * Copyright (C) 2003, 2005 Oracle. All rights reserved. */ #ifndef OCFS2_JOURNAL_H #define OCFS2_JOURNAL_H #include <linux/fs.h> #include <linux/jbd2.h> enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, OCFS2_JOURNAL_LOADED, OCFS2_JOURNAL_IN_SHUTDOWN, }; struct ocfs2_super; struct ocfs2_dinode; /* * The recovery_list is a simple linked list of node numbers to recover. * It is protected by the recovery_lock. */ struct ocfs2_recovery_map { unsigned int rm_used; unsigned int rm_entries[]; }; struct ocfs2_journal { enum ocfs2_journal_state j_state; /* Journals current state */ journal_t *j_journal; /* The kernels journal type */ struct inode *j_inode; /* Kernel inode pointing to * this journal */ struct ocfs2_super *j_osb; /* pointer to the super * block for the node * we're currently * running on -- not * necessarily the super * block from the node * which we usually run * from (recovery, * etc) */ struct buffer_head *j_bh; /* Journal disk inode block */ atomic_t j_num_trans; /* Number of transactions * currently in the system. */ spinlock_t j_lock; unsigned long j_trans_id; struct rw_semaphore j_trans_barrier; wait_queue_head_t j_checkpointed; /* both fields protected by j_lock*/ struct list_head j_la_cleanups; struct work_struct j_recovery_work; }; extern spinlock_t trans_inc_lock; /* wrap j_trans_id so we never have it equal to zero. */ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) { unsigned long old_id; spin_lock(&trans_inc_lock); old_id = j->j_trans_id++; if (unlikely(!j->j_trans_id)) j->j_trans_id = 1; spin_unlock(&trans_inc_lock); return old_id; } static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal, struct ocfs2_caching_info *ci) { spin_lock(&trans_inc_lock); ci->ci_last_trans = journal->j_trans_id; spin_unlock(&trans_inc_lock); } /* Used to figure out whether it's safe to drop a metadata lock on an * cached object. Returns true if all the object's changes have been * checkpointed to disk. You should be holding the spinlock on the * metadata lock while calling this to be sure that nobody can take * the lock and put it on another transaction. */ static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci) { int ret; struct ocfs2_journal *journal = OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; spin_lock(&trans_inc_lock); ret = time_after(journal->j_trans_id, ci->ci_last_trans); spin_unlock(&trans_inc_lock); return ret; } /* convenience function to check if an object backed by struct * ocfs2_caching_info is still new (has never hit disk) Will do you a * favor and set created_trans = 0 when you've * been checkpointed. returns '1' if the ci is still new. */ static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci) { int ret; struct ocfs2_journal *journal = OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; spin_lock(&trans_inc_lock); ret = !(time_after(journal->j_trans_id, ci->ci_created_trans)); if (!ret) ci->ci_created_trans = 0; spin_unlock(&trans_inc_lock); return ret; } /* Wrapper for inodes so we can check system files */ static inline int ocfs2_inode_is_new(struct inode *inode) { /* System files are never "new" as they're written out by * mkfs. This helps us early during mount, before we have the * journal open and j_trans_id could be junk. */ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) return 0; return ocfs2_ci_is_new(INODE_CACHE(inode)); } static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, struct ocfs2_caching_info *ci) { spin_lock(&trans_inc_lock); ci->ci_created_trans = osb->journal->j_trans_id; spin_unlock(&trans_inc_lock); } /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_orphan_scan_init(struct ocfs2_super *osb); void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); void ocfs2_free_replay_slots(struct ocfs2_super *osb); /* * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. * * ocfs2_journal_alloc - Initialize skeleton for journal structure. * ocfs2_journal_init - Initialize journal structures in the OSB. * ocfs2_journal_load - Load the given journal off disk. Replay it if * there's transactions still in there. * ocfs2_journal_shutdown - Shutdown a journal, this will flush all * uncommitted, uncheckpointed transactions. * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally * zero out each block. * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb. * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat * event on. * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); int ocfs2_journal_alloc(struct ocfs2_super *osb); int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed); int ocfs2_check_journals_nolocks(struct ocfs2_super *osb); void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num); int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) { wake_up(&osb->checkpoint_event); } static inline void ocfs2_checkpoint_inode(struct inode *inode) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (ocfs2_mount_local(osb)) return; if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) { /* WARNING: This only kicks off a single * checkpoint. If someone races you and adds more * metadata to the journal, you won't know, and will * wind up waiting *a lot* longer than necessary. Right * now we only use this in clear_inode so that's * OK. */ ocfs2_start_checkpoint(osb); wait_event(osb->journal->j_checkpointed, ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))); } } /* * Transaction Handling: * Manage the lifetime of a transaction handle. * * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of * the number of blocks that will be changed during * this handle. * ocfs2_commit_trans - Complete a handle. It might return -EIO if * the journal was aborted. The majority of paths don't * check the return value as an error there comes too * late to do anything (and will be picked up in a * later transaction). * ocfs2_extend_trans - Extend a handle by nblocks credits. This may * commit the handle to disk in the process, but will * not release any locks taken during the transaction. * ocfs2_journal_access* - Notify the handle that we want to journal this * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . * Always call the specific flavor of * ocfs2_journal_access_*() unless you intend to * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes * out before the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's * perfectly legal to go through an entire transaction without having * dirtied any buffers. */ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs); int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); int ocfs2_assure_trans_credits(handle_t *handle, int nblocks); int ocfs2_allocate_extend_trans(handle_t *handle, int thresh); /* * Define an arbitrary limit for the amount of data we will anticipate * writing to any given transaction. For unbounded transactions such as * fallocate(2) we can write more than this, but we always * start off at the maximum transaction size and grow the transaction * optimistically as we go. */ #define OCFS2_MAX_TRANS_DATA 64U /* * Create access is for when we get a newly created buffer and we're * not gonna read it off disk, but rather fill it ourselves. Right * now, we don't do anything special with this (it turns into a write * request), but this is a good placeholder in case we do... * * Write access is for when we read a block off disk and are going to * modify it. This way the journalling layer knows it may need to make * a copy of that block (if it's part of another, uncommitted * transaction) before we do so. */ #define OCFS2_JOURNAL_ACCESS_CREATE 0 #define OCFS2_JOURNAL_ACCESS_WRITE 1 #define OCFS2_JOURNAL_ACCESS_UNDO 2 /* ocfs2_inode */ int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_extent_block */ int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_refcount_block */ int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_group_desc */ int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_xattr_block */ int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* quota blocks */ int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* dirblock */ int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_dx_root_block */ int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_dx_leaf */ int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* Anything that has no ecc */ int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* * A word about the journal_access/journal_dirty "dance". It is * entirely legal to journal_access a buffer more than once (as long * as the access type is the same -- I'm not sure what will happen if * access type is different but this should never happen anyway) It is * also legal to journal_dirty a buffer more than once. In fact, you * can even journal_access a buffer after you've done a * journal_access/journal_dirty pair. The only thing you cannot do * however, is journal_dirty a buffer which you haven't yet passed to * journal_access at least once. * * That said, 99% of the time this doesn't matter and this is what the * path looks like: * * <read a bh> * ocfs2_journal_access(handle, bh, OCFS2_JOURNAL_ACCESS_WRITE); * <modify the bh> * ocfs2_journal_dirty(handle, bh); */ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); /* * Credit Macros: * Convenience macros to calculate number of credits needed. * * For convenience sake, I have a set of macros here which calculate * the *maximum* number of sectors which will be changed for various * metadata updates. */ /* simple file updates like chmod, etc. */ #define OCFS2_INODE_UPDATE_CREDITS 1 /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 /* Update of a single quota block */ #define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 /* global quotafile inode update, data block */ #define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) #define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS /* * The two writes below can accidentally see global info dirty due * to set_info() quotactl so make them prepared for the writes. */ /* quota data block, global info */ /* Write to local quota file */ #define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) /* global quota data block, local quota data block, global quota inode, * global quota info */ #define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) static inline int ocfs2_quota_trans_credits(struct super_block *sb) { int credits = 0; if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) credits += OCFS2_QWRITE_CREDITS; if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) credits += OCFS2_QWRITE_CREDITS; return credits; } /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) /* group add. inode update and the new group update. */ #define OCFS2_GROUP_ADD_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) /* get one bit out of a suballocator: dinode + group descriptor + * prev. group desc. if we relink. */ #define OCFS2_SUBALLOC_ALLOC (3) static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) { return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + ocfs2_quota_trans_credits(sb); } /* dinode + group descriptor update. We don't relink on free yet. */ #define OCFS2_SUBALLOC_FREE (2) #define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + OCFS2_TRUNCATE_LOG_UPDATE) static inline int ocfs2_remove_extent_credits(struct super_block *sb) { return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + ocfs2_quota_trans_credits(sb); } /* data block for new dir/symlink, allocation of directory block, dx_root * update for free list */ #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) static inline int ocfs2_add_dir_index_credits(struct super_block *sb) { /* 1 block for index, 2 allocs (data, metadata), 1 clusters * worth of blocks for initial extent. */ return 1 + 2 * OCFS2_SUBALLOC_ALLOC + ocfs2_clusters_to_blocks(sb, 1); } /* parent fe, parent block, new file entry, index leaf, inode alloc fe, inode * alloc group descriptor + mkdir/symlink blocks + dir blocks + xattr * blocks + quota update */ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, int xattr_credits) { int dir_credits = OCFS2_DIR_LINK_ADDITIONAL_CREDITS; if (is_dir) dir_credits += ocfs2_add_dir_index_credits(sb); return 4 + OCFS2_SUBALLOC_ALLOC + dir_credits + xattr_credits + ocfs2_quota_trans_credits(sb); } /* local alloc metadata change + main bitmap updates */ #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE) /* used when we don't need an allocation change for a dir extend. One * for the dinode, one for the new block. */ #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota * update on dir + index leaf + dx root update for free list + * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } /* inode + dir inode (if we unlink a dir), + dir entry block + orphan * dir inode link + dir inode index leaf + dir index root */ static inline int ocfs2_unlink_credits(struct super_block *sb) { /* The quota update from ocfs2_link_credits is unused here... */ return 2 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_link_credits(sb); } /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + * inode alloc group descriptor + orphan dir index root + * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) /* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + * orphan dir index root + orphan dir index leaf */ #define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) #define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ static inline int ocfs2_rename_credits(struct super_block *sb) { return 3 * OCFS2_INODE_UPDATE_CREDITS + 6 + ocfs2_unlink_credits(sb); } /* global bitmap dinode, group desc., relinked group, * suballocator dinode, group desc., relinked group, * dinode, xattr block */ #define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + OCFS2_INODE_UPDATE_CREDITS \ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) /* inode update, removal of dx root block from allocator */ #define OCFS2_DX_ROOT_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ OCFS2_SUBALLOC_FREE) static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) { int credits = 1 + OCFS2_SUBALLOC_ALLOC; credits += ocfs2_clusters_to_blocks(sb, 1); credits += ocfs2_quota_trans_credits(sb); return credits; } /* inode update, new refcount block and its allocation credits. */ #define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \ + OCFS2_SUBALLOC_ALLOC) /* inode and the refcount block update. */ #define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) /* * inode and the refcount block update. * It doesn't include the credits for sub alloc change. * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added. */ #define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) /* 2 metadata alloc, 2 new blocks and root refcount block */ #define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3) /* * Please note that the caller must make sure that root_el is the root * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise * the result may be wrong. */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, struct ocfs2_extent_list *root_el) { int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; /* bitmap dinode, group desc. + relinked group. */ bitmap_blocks = OCFS2_SUBALLOC_ALLOC; /* we might need to shift tree depth so lets assume an * absolute worst case of complete fragmentation. Even with * that, we only need one update for the dinode, and then * however many metadata chunks needed * a remaining suballoc * alloc. */ sysfile_bitmap_blocks = 1 + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); /* this does not include *new* metadata blocks, which are * accounted for in sysfile_bitmap_blocks. root_el + * prev. last_eb_blk + blocks along edge of tree. * calc_symlink_credits passes because we just need 1 * credit for the dinode there. */ extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) { int blocks = ocfs2_mknod_credits(sb, 0, 0); /* links can be longer than one block so we may update many * within our single allocated extent. */ blocks += ocfs2_clusters_to_blocks(sb, 1); return blocks + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, unsigned int cpg) { int blocks; int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1; /* parent inode update + new block group header + bitmap inode update + bitmap blocks affected */ blocks = 1 + 1 + 1 + bitmap_blocks; return blocks; } /* * Allocating a discontiguous block group requires the credits from * ocfs2_calc_group_alloc_credits() as well as enough credits to fill * the group descriptor's extent list. The caller already has started * the transaction with ocfs2_calc_group_alloc_credits(). They extend * it with these credits. */ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) { return ocfs2_extent_recs_per_gd(sb); } static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, loff_t start_byte, loff_t length) { return jbd2_journal_inode_ranged_write(handle, &OCFS2_I(inode)->ip_jinode, start_byte, length); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { return jbd2_journal_begin_ordered_truncate( OCFS2_SB(inode->i_sb)->journal->j_journal, &OCFS2_I(inode)->ip_jinode, new_size); } static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) { struct ocfs2_inode_info *oi = OCFS2_I(inode); if (!is_handle_aborted(handle)) { oi->i_sync_tid = handle->h_transaction->t_tid; if (datasync) oi->i_datasync_tid = handle->h_transaction->t_tid; } } #endif /* OCFS2_JOURNAL_H */
421 441 441 440 441 441 76 61 439 441 80 61 428 19 441 441 441 441 394 61 32 27 441 1934 1931 441 441 441 441 441 61 441 278 278 728 565 417 414 7 220 138 104 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dsa/user.c - user device handling * Copyright (c) 2008-2009 Marvell Semiconductor */ #include <linux/list.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> #include <linux/string.h> #include "conduit.h" #include "dsa.h" #include "netlink.h" #include "port.h" #include "switch.h" #include "tag.h" #include "user.h" struct dsa_switchdev_event_work { struct net_device *dev; struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and * SWITCHDEV_FDB_DEL_TO_DEVICE */ unsigned char addr[ETH_ALEN]; u16 vid; bool host_addr; }; enum dsa_standalone_event { DSA_UC_ADD, DSA_UC_DEL, DSA_MC_ADD, DSA_MC_DEL, }; struct dsa_standalone_event_work { struct work_struct work; struct net_device *dev; enum dsa_standalone_event event; unsigned char addr[ETH_ALEN]; u16 vid; }; struct dsa_host_vlan_rx_filtering_ctx { struct net_device *dev; const unsigned char *addr; enum dsa_standalone_event event; }; static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) { return ds->ops->port_mdb_add && ds->ops->port_mdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static void dsa_user_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = container_of(work, struct dsa_standalone_event_work, work); const unsigned char *addr = standalone_work->addr; struct net_device *dev = standalone_work->dev; struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_mdb mdb; struct dsa_switch *ds = dp->ds; u16 vid = standalone_work->vid; int err; switch (standalone_work->event) { case DSA_UC_ADD: err = dsa_port_standalone_host_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_UC_DEL: err = dsa_port_standalone_host_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; case DSA_MC_ADD: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_add(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to mdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_MC_DEL: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_del(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from mdb: %d\n", dp->index, addr, vid, err); } break; } kfree(standalone_work); } static int dsa_user_schedule_standalone_work(struct net_device *dev, enum dsa_standalone_event event, const unsigned char *addr, u16 vid) { struct dsa_standalone_event_work *standalone_work; standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); if (!standalone_work) return -ENOMEM; INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work); standalone_work->event = event; standalone_work->dev = dev; ether_addr_copy(standalone_work->addr, addr); standalone_work->vid = vid; dsa_schedule_work(&standalone_work->work); return 0; } static int dsa_user_host_vlan_rx_filtering(void *arg, int vid) { struct dsa_host_vlan_rx_filtering_ctx *ctx = arg; return dsa_user_schedule_standalone_work(ctx->dev, ctx->event, ctx->addr, vid); } static int dsa_user_vlan_for_each(struct net_device *dev, int (*cb)(void *arg, int vid), void *arg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_vlan *v; int err; lockdep_assert_held(&dev->addr_list_lock); err = cb(arg, 0); if (err) return err; list_for_each_entry(v, &dp->user_vlans, list) { err = cb(arg, v->vid); if (err) return err; } return 0; } static int dsa_user_sync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_ADD, }; dev_uc_add(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_DEL, }; dev_uc_del(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_sync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_ADD, }; dev_mc_add(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_DEL, }; dev_mc_del(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } void dsa_user_sync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_sync_mc(dev, ha->addr); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_sync_uc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } void dsa_user_unsync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_unsync_uc(dev, ha->addr); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_unsync_mc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } /* user mii_bus handling ***************************************************/ static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_read(ds, addr, reg); return 0xffff; } static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_write(ds, addr, reg, val); return 0; } void dsa_user_mii_bus_init(struct dsa_switch *ds) { ds->user_mii_bus->priv = (void *)ds; ds->user_mii_bus->name = "dsa user smi"; ds->user_mii_bus->read = dsa_user_phy_read; ds->user_mii_bus->write = dsa_user_phy_write; snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", ds->dst->index, ds->index); ds->user_mii_bus->parent = ds->dev; ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask; } /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err; if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, addr, 0); if (err) goto out; } if (!ether_addr_equal(addr, conduit->dev_addr)) { err = dev_uc_add(conduit, addr); if (err < 0) goto del_host_addr; } return 0; del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, addr, 0); out: return err; } void dsa_user_host_uc_uninstall(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); } static int dsa_user_open(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); int err; err = dev_open(conduit, NULL); if (err < 0) { netdev_err(dev, "failed to open conduit %s\n", conduit->name); goto out; } err = dsa_user_host_uc_install(dev, dev->dev_addr); if (err) goto out; err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto out_del_host_uc; return 0; out_del_host_uc: dsa_user_host_uc_uninstall(dev); out: return err; } static int dsa_user_close(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); dsa_port_disable_rt(dp); dsa_user_host_uc_uninstall(dev); return 0; } static void dsa_user_manage_host_flood(struct net_device *dev) { bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); struct dsa_port *dp = dsa_user_to_port(dev); bool uc = dev->flags & IFF_PROMISC; dsa_port_set_host_flood(dp, uc, mc); } static void dsa_user_change_rx_flags(struct net_device *dev, int change) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (change & IFF_ALLMULTI) dev_set_allmulti(conduit, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(conduit, dev->flags & IFF_PROMISC ? 1 : -1); if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) dsa_user_manage_host_flood(dev); } static void dsa_user_set_rx_mode(struct net_device *dev) { __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc); __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc); } static int dsa_user_set_mac_address(struct net_device *dev, void *a) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (ds->ops->port_set_mac_address) { err = ds->ops->port_set_mac_address(ds, dp->index, addr->sa_data); if (err) return err; } /* If the port is down, the address isn't synced yet to hardware or * to the DSA conduit, so there is nothing to change. */ if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; err = dsa_user_host_uc_install(dev, addr->sa_data); if (err) return err; dsa_user_host_uc_uninstall(dev); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; } struct dsa_user_dump_ctx { struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; static int dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dump->dev->ifindex; ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); skip: dump->idx++; return 0; nla_put_failure: nlmsg_cancel(dump->skb, nlh); return -EMSGSIZE; } static int dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; int err; err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump); *idx = dump.idx; return err; } static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int ret; if (ctx && ctx != dp) return 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_PORT_MST_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_MST: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_mst_enable(dp, attr->u.mst, extack); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_VLAN_MSTI: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti); break; default: ret = -EOPNOTSUPP; break; } return ret; } /* Must be called under rcu_read_lock() */ static int dsa_user_vlan_check_for_8021q_uppers(struct net_device *user, const struct switchdev_obj_port_vlan *vlan) { struct net_device *upper_dev; struct list_head *iter; netdev_for_each_upper_dev_rcu(user, upper_dev, iter) { u16 vid; if (!is_vlan_dev(upper_dev)) continue; vid = vlan_dev_vlan_id(upper_dev); if (vid == vlan->vid) return -EBUSY; } return 0; } static int dsa_user_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, "Port already has a VLAN upper with this VID"); return err; } } return dsa_port_vlan_add(dp, vlan, extack); } /* Offload a VLAN installed on the bridge or on a foreign interface by * installing it as a VLAN towards the CPU port. */ static int dsa_user_host_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_add(dev, obj, extack); else err = dsa_user_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static int dsa_user_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_vlan_del(dp, vlan); } static int dsa_user_host_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_host_vlan_del(dp, vlan); } static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_del(dev, obj); else err = dsa_user_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); return netpoll_send_skb(p->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) return; ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) { /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) return dsa_user_netpoll_send_skb(dev, skb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ skb->dev = dsa_user_to_conduit(dev); dev_queue_xmit(skb); return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct sk_buff *nskb; dev_sw_netstats_tx_add(dev, 1, skb->len); memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (skb_ensure_writable_head_tail(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from * skb_ensure_writable_head_tail(), which has also ensured that * padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ nskb = p->xmit(skb, dev); if (!nskb) { kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } /* ethtool operations *******************************************************/ static void dsa_user_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_user_get_regs_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } static void dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) ds->ops->get_regs(ds, dp->index, regs, _p); } static int dsa_user_nway_reset(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_nway_reset(dp->pl); } static int dsa_user_get_eeprom_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->ops->get_eeprom_len) return ds->ops->get_eeprom_len(ds); return 0; } static int dsa_user_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static int dsa_user_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static void dsa_user_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { ethtool_puts(&data, "tx_packets"); ethtool_puts(&data, "tx_bytes"); ethtool_puts(&data, "rx_packets"); ethtool_puts(&data, "rx_bytes"); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } } static void dsa_user_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_user_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count = 0; if (ds->ops->get_sset_count) { count = ds->ops->get_sset_count(ds, dp->index, sset); if (count < 0) return count; } return count + 4; } else if (sset == ETH_SS_TEST) { return net_selftest_get_count(); } return -EOPNOTSUPP; } static void dsa_user_get_eth_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_phy_stats) ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); } static void dsa_user_get_eth_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_mac_stats) ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); } static void dsa_user_get_eth_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_ctrl_stats) ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } static void dsa_user_get_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_rmon_stats) ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } static void dsa_user_get_ts_stats(struct net_device *dev, struct ethtool_ts_stats *ts_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_ts_stats) ds->ops->get_ts_stats(ds, dp->index, ts_stats); } static void dsa_user_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { struct dsa_port *dp = dsa_user_to_port(ndev); struct dsa_switch *ds = dp->ds; if (ds->ops->self_test) { ds->ops->self_test(ds, dp->index, etest, buf); return; } net_selftest(ndev, etest, buf); } static int dsa_user_get_mm(struct net_device *dev, struct ethtool_mm_state *state) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_mm) return -EOPNOTSUPP; return ds->ops->get_mm(ds, dp->index, state); } static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_mm) return -EOPNOTSUPP; return ds->ops->set_mm(ds, dp->index, cfg, extack); } static void dsa_user_get_mm_stats(struct net_device *dev, struct ethtool_mm_stats *stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_mm_stats) ds->ops->get_mm_stats(ds, dp->index, stats); } static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; phylink_ethtool_get_wol(dp->pl, w); if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; phylink_ethtool_set_wol(dp->pl, w); if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* If the port is using phylink managed EEE, then an unimplemented * set_mac_eee() is permissible. */ if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) { /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } else if (ds->ops->set_mac_eee) { ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; return phylink_ethtool_get_eee(dp->pl, e); } static int dsa_user_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_get(dp->pl, cmd); } static int dsa_user_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_set(dp->pl, cmd); } static void dsa_user_get_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_pause_stats) ds->ops->get_pause_stats(ds, dp->index, pause_stats); } static void dsa_user_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); phylink_ethtool_get_pauseparam(dp->pl, pause); } static int dsa_user_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_set_pauseparam(dp->pl, pause); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_user_netpoll_setup(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); if (!netpoll) return -ENOMEM; err = __netpoll_setup(netpoll, conduit); if (err) { kfree(netpoll); goto out; } p->netpoll = netpoll; out: return err; } static void dsa_user_netpoll_cleanup(struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll = p->netpoll; if (!netpoll) return; p->netpoll = NULL; __netpoll_free(netpoll); } static void dsa_user_poll_controller(struct net_device *dev) { } #endif static struct dsa_mall_tc_entry * dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) if (mall_tc_entry->cookie == cookie) return mall_tc_entry; return NULL; } static int dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress, bool ingress_target) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; int err; if (cls->common.protocol != htons(ETH_P_ALL)) { NL_SET_ERR_MSG_MOD(extack, "Can only offload \"protocol all\" matchall filter"); return -EOPNOTSUPP; } if (!ds->ops->port_mirror_add) { NL_SET_ERR_MSG_MOD(extack, "Switch does not support mirroring operation"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; if (!act->dev) return -EINVAL; if (dsa_user_dev_check(act->dev)) { if (ingress_target) { /* We can only fulfill this using software assist */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to ingress of DSA user port if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } else { to_dp = dsa_user_to_port(act->dev); } } else { /* Handle mirroring to foreign target ports as a mirror towards * the CPU. The software tc rule will take the packets from * there. */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to CPU if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } if (dp->ds != to_dp->ds) { NL_SET_ERR_MSG_MOD(extack, "Cross-chip mirroring not implemented"); return -EOPNOTSUPP; } mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; mirror->to_local_port = to_dp->index; mirror->ingress = ingress; err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall_police(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, "Only supported on ingress qdisc"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, "Only one port policer allowed"); return -EEXIST; } } act = &cls->rule->action.entries[0]; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; policer->rate_bytes_per_sec = act->police.rate_bytes_ps; policer->burst = act->police.burst; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { const struct flow_action *action = &cls->rule->action; struct netlink_ext_ack *extack = cls->common.extack; if (!flow_offload_has_one_action(action)) { NL_SET_ERR_MSG_MOD(extack, "Cannot offload matchall filter with more than one action"); return -EOPNOTSUPP; } switch (action->entries[0].id) { case FLOW_ACTION_MIRRED: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, false); case FLOW_ACTION_MIRRED_INGRESS: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, true); case FLOW_ACTION_POLICE: return dsa_user_add_cls_matchall_police(dev, cls, ingress); default: NL_SET_ERR_MSG_MOD(extack, "Unknown action"); break; } return -EOPNOTSUPP; } static void dsa_user_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; list_del(&mall_tc_entry->list); switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: if (ds->ops->port_mirror_del) ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; case DSA_PORT_MALL_POLICER: if (ds->ops->port_policer_del) ds->ops->port_policer_del(ds, dp->index); break; default: WARN_ON(1); } kfree(mall_tc_entry); } static int dsa_user_setup_tc_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: return dsa_user_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_user_del_cls_matchall(dev, cls); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_add_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_add) return -EOPNOTSUPP; return ds->ops->cls_flower_add(ds, port, cls, ingress); } static int dsa_user_del_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_del) return -EOPNOTSUPP; return ds->ops->cls_flower_del(ds, port, cls, ingress); } static int dsa_user_stats_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_stats) return -EOPNOTSUPP; return ds->ops->cls_flower_stats(ds, port, cls, ingress); } static int dsa_user_setup_tc_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { switch (cls->command) { case FLOW_CLS_REPLACE: return dsa_user_add_cls_flower(dev, cls, ingress); case FLOW_CLS_DESTROY: return dsa_user_del_cls_flower(dev, cls, ingress); case FLOW_CLS_STATS: return dsa_user_stats_cls_flower(dev, cls, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv, bool ingress) { struct net_device *dev = cb_priv; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress); case TC_SETUP_CLSFLOWER: return dsa_user_setup_tc_cls_flower(dev, type_data, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true); } static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false); } static LIST_HEAD(dsa_user_block_cb_list); static int dsa_user_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_user_setup_tc_block_cb_ig; else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_user_setup_tc_block_cb_eg; else return -EOPNOTSUPP; f->driver_block_list = &dsa_user_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port)); if (!conduit->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data); } static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; switch (type) { case TC_SETUP_BLOCK: return dsa_user_setup_tc_block(dev, type_data); case TC_SETUP_FT: return dsa_user_setup_ft_block(ds, dp->index, type_data); default: break; } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static int dsa_user_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_user_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; return ds->ops->set_rxnfc(ds, dp->index, nfc); } static int dsa_user_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *ts) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; if (!ds->ops->get_ts_info) return -EOPNOTSUPP; return ds->ops->get_ts_info(ds, p->dp->index, ts); } static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct netlink_ext_ack extack = {0}; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int ret; /* User port... */ ret = dsa_port_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return ret; } /* And CPU port... */ ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, extack._msg); return ret; } if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; v = kzalloc(sizeof(*v), GFP_KERNEL); if (!v) { ret = -ENOMEM; goto rollback; } netif_addr_lock_bh(dev); v->vid = vid; list_add_tail(&v->list, &dp->user_vlans); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_ADD, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_ADD, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; rollback: dsa_port_host_vlan_del(dp, &vlan); dsa_port_vlan_del(dp, &vlan); return ret; } static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int err; err = dsa_port_vlan_del(dp, &vlan); if (err) return err; err = dsa_port_host_vlan_del(dp, &vlan); if (err) return err; if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; netif_addr_lock_bh(dev); v = dsa_vlan_find(&dp->user_vlans, &vlan); if (!v) { netif_addr_unlock_bh(dev); return -ENOENT; } list_del(&v->list); kfree(v); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_DEL, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_DEL, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; } static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_add_vid(arg, proto, vid); } static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_kill_vid(arg, proto, vid); } /* Keep the VLAN RX filtering list in sync with the hardware only if VLAN * filtering is enabled. The baseline is that only ports that offload a * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, * but there are exceptions for quirky hardware. * * If ds->vlan_filtering_is_global = true, then standalone ports which share * the same switch with other ports that offload a VLAN-aware bridge are also * inevitably VLAN-aware. * * To summarize, a DSA switch port offloads: * * - If standalone (this includes software bridge, software LAG): * - if ds->needs_standalone_vlan_filtering = true, OR if * (ds->vlan_filtering_is_global = true AND there are bridges spanning * this switch chip which have vlan_filtering=1) * - the 8021q upper VLANs * - else (standalone VLAN filtering is not needed, VLAN filtering is not * global, or it is, but no port is under a VLAN-aware bridge): * - no VLAN (any 8021q upper is a software VLAN) * * - If under a vlan_filtering=0 bridge which it offload: * - if ds->configure_vlan_while_not_filtering = true (default): * - the bridge VLANs. These VLANs are committed to hardware but inactive. * - else (deprecated): * - no VLAN. The bridge VLANs are not restored when VLAN awareness is * enabled, so this behavior is broken and discouraged. * * - If under a vlan_filtering=1 bridge which it offload: * - the bridge VLANs * - the 8021q upper VLANs */ int dsa_user_manage_vlan_filtering(struct net_device *user, bool vlan_filtering) { int err; if (vlan_filtering) { user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; err = vlan_for_each(user, dsa_user_restore_vlan, user); if (err) { vlan_for_each(user, dsa_user_clear_vlan, user); user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; return err; } } else { err = vlan_for_each(user, dsa_user_clear_vlan, user); if (err) return err; user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } return 0; } struct dsa_hw_port { struct list_head list; struct net_device *dev; int old_mtu; }; static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) { const struct dsa_hw_port *p; int err; list_for_each_entry(p, hw_port_list, list) { if (p->dev->mtu == mtu) continue; err = dev_set_mtu(p->dev, mtu); if (err) goto rollback; } return 0; rollback: list_for_each_entry_continue_reverse(p, hw_port_list, list) { if (p->dev->mtu == p->old_mtu) continue; if (dev_set_mtu(p->dev, p->old_mtu)) netdev_err(p->dev, "Failed to restore MTU\n"); } return err; } static void dsa_hw_port_list_free(struct list_head *hw_port_list) { struct dsa_hw_port *p, *n; list_for_each_entry_safe(p, n, hw_port_list, list) kfree(p); } /* Make the hardware datapath to/from @dev limited to a common MTU */ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; int min_mtu = ETH_MAX_MTU; struct dsa_port *other_dp; int err; if (!dp->ds->mtu_enforcement_ingress) return; if (!dp->bridge) return; INIT_LIST_HEAD(&hw_port_list); /* Populate the list of ports that are part of the same bridge * as the newly added/modified port */ list_for_each_entry(dst, &dsa_tree_list, list) { list_for_each_entry(other_dp, &dst->ports, list) { struct dsa_hw_port *hw_port; struct net_device *user; if (other_dp->type != DSA_PORT_TYPE_USER) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; if (!other_dp->ds->mtu_enforcement_ingress) continue; user = other_dp->user; if (min_mtu > user->mtu) min_mtu = user->mtu; hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); if (!hw_port) goto out; hw_port->dev = user; hw_port->old_mtu = user->mtu; list_add(&hw_port->list, &hw_port_list); } } /* Attempt to configure the entire hardware bridge to the newly added * interface's MTU first, regardless of whether the intention of the * user was to raise or lower it. */ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu); if (!err) goto out; /* Clearly that didn't work out so well, so just set the minimum MTU on * all hardware bridge ports now. If this fails too, then all ports will * still have their old MTU rolled back anyway. */ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); out: dsa_hw_port_list_free(&hw_port_list); } int dsa_user_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_port *cpu_dp = dp->cpu_dp; struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int largest_mtu = 0; int new_conduit_mtu; int old_conduit_mtu; int mtu_limit; int overhead; int cpu_mtu; int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; dsa_tree_for_each_user_port(other_dp, ds->dst) { int user_mtu; /* During probe, this function will be called for each user * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ if (!other_dp->user) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ if (dp == other_dp) user_mtu = new_mtu; else user_mtu = other_dp->user->mtu; if (largest_mtu < user_mtu) largest_mtu = user_mtu; } overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead); old_conduit_mtu = conduit->mtu; new_conduit_mtu = largest_mtu + overhead; if (new_conduit_mtu > mtu_limit) return -ERANGE; /* If the conduit MTU isn't over limit, there's no need to check the CPU * MTU, since that surely isn't either. */ cpu_mtu = largest_mtu; /* Start applying stuff */ if (new_conduit_mtu != old_conduit_mtu) { err = dev_set_mtu(conduit, new_conduit_mtu); if (err < 0) goto out_conduit_failed; /* We only need to propagate the MTU of the CPU port to * upstream switches, so emit a notifier which updates them. */ err = dsa_port_mtu_change(cpu_dp, cpu_mtu); if (err) goto out_cpu_failed; } err = ds->ops->port_change_mtu(ds, dp->index, new_mtu); if (err) goto out_port_failed; WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); return 0; out_port_failed: if (new_conduit_mtu != old_conduit_mtu) dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead); out_cpu_failed: if (new_conduit_mtu != old_conduit_mtu) dev_set_mtu(conduit, old_conduit_mtu); out_conduit_failed: return err; } static int __maybe_unused dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_set_apptrust) return -EOPNOTSUPP; return ds->ops->port_set_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_get_apptrust) return -EOPNOTSUPP; return ds->ops->port_get_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } /* Update the DSCP prio entries on all user ports of the switch in case * the switch supports global DSCP prio instead of per port DSCP prios. */ static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, struct dcb_app *app, bool del) { int (*setdel)(struct net_device *dev, struct dcb_app *app); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int err, restore_err; if (del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; err = setdel(user, app); if (err) goto err_try_to_restore; } return 0; err_try_to_restore: /* Revert logic to restore previous state of app entries */ if (!del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; restore_err = setdel(user, app); if (restore_err) netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); } return err; } static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_add_dscp_prio) return -EOPNOTSUPP; if (dscp >= 64) { netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n", dscp); return -EINVAL; } err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); if (err) { if (ds->ops->port_del_dscp_prio) ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_set_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_add_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } static int __maybe_unused dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = mask ? __fls(mask) : 0; err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_del_dscp_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority); if (err) { dcb_ieee_setapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); if (err) { if (ds->ops->port_add_dscp_prio) ds->ops->port_add_dscp_prio(ds, port, dscp, app->priority); dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_del_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_del_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } /* Pre-populate the DCB application priority table with the priorities * configured during switch setup, which we read from hardware here. */ static int dsa_user_dcbnl_init(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; int err; if (ds->ops->port_get_default_prio) { int prio = ds->ops->port_get_default_prio(ds, port); struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, .protocol = 0, .priority = prio, }; if (prio < 0) return prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } if (ds->ops->port_get_dscp_prio) { int protocol; for (protocol = 0; protocol < 64; protocol++) { struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_DSCP, .protocol = protocol, }; int prio; prio = ds->ops->port_get_dscp_prio(ds, port, protocol); if (prio == -EOPNOTSUPP) continue; if (prio < 0) return prio; app.priority = prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } } return 0; } static const struct ethtool_ops dsa_user_ethtool_ops = { .get_drvinfo = dsa_user_get_drvinfo, .get_regs_len = dsa_user_get_regs_len, .get_regs = dsa_user_get_regs, .nway_reset = dsa_user_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_user_get_eeprom_len, .get_eeprom = dsa_user_get_eeprom, .set_eeprom = dsa_user_set_eeprom, .get_strings = dsa_user_get_strings, .get_ethtool_stats = dsa_user_get_ethtool_stats, .get_sset_count = dsa_user_get_sset_count, .get_eth_phy_stats = dsa_user_get_eth_phy_stats, .get_eth_mac_stats = dsa_user_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats, .get_rmon_stats = dsa_user_get_rmon_stats, .get_ts_stats = dsa_user_get_ts_stats, .set_wol = dsa_user_set_wol, .get_wol = dsa_user_get_wol, .set_eee = dsa_user_set_eee, .get_eee = dsa_user_get_eee, .get_link_ksettings = dsa_user_get_link_ksettings, .set_link_ksettings = dsa_user_set_link_ksettings, .get_pause_stats = dsa_user_get_pause_stats, .get_pauseparam = dsa_user_get_pauseparam, .set_pauseparam = dsa_user_set_pauseparam, .get_rxnfc = dsa_user_get_rxnfc, .set_rxnfc = dsa_user_set_rxnfc, .get_ts_info = dsa_user_get_ts_info, .self_test = dsa_user_net_selftest, .get_mm = dsa_user_get_mm, .set_mm = dsa_user_set_mm, .get_mm_stats = dsa_user_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_stats64) ds->ops->get_stats64(ds, dp->index, s); else dev_get_tstats64(dev, s); } static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_user_to_port(ctx->dev); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; ctx->dev = conduit; return 0; } static int dsa_user_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->port_hwtstamp_get) return -EOPNOTSUPP; return ds->ops->port_hwtstamp_get(ds, dp->index, cfg); } static int dsa_user_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->port_hwtstamp_set) return -EOPNOTSUPP; return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack); } static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, .ndo_start_xmit = dsa_user_xmit, .ndo_change_rx_flags = dsa_user_change_rx_flags, .ndo_set_rx_mode = dsa_user_set_rx_mode, .ndo_set_mac_address = dsa_user_set_mac_address, .ndo_fdb_dump = dsa_user_fdb_dump, .ndo_eth_ioctl = dsa_user_ioctl, .ndo_get_iflink = dsa_user_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_user_netpoll_setup, .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup, .ndo_poll_controller = dsa_user_poll_controller, #endif .ndo_setup_tc = dsa_user_setup_tc, .ndo_get_stats64 = dsa_user_get_stats64, .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, .ndo_hwtstamp_get = dsa_user_hwtstamp_get, .ndo_hwtstamp_set = dsa_user_hwtstamp_set, }; static const struct device_type dsa_type = { .name = "dsa", }; void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); if (dp->pl) phylink_mac_change(dp->pl, up); } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would * not be called if it was not. */ ds->ops->phylink_fixed_state(ds, dp->index, state); } /* user device setup *******************************************************/ static int dsa_user_phy_connect(struct net_device *user_dev, int addr, u32 flags) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_switch *ds = dp->ds; user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr); if (!user_dev->phydev) { netdev_err(user_dev, "no phy at %d\n", addr); return -ENODEV; } user_dev->phydev->dev_flags |= flags; return phylink_connect_phy(dp->pl, user_dev->phydev); } static int dsa_user_phy_setup(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; u32 phy_flags = 0; int ret; dp->pl_config.dev = &user_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; /* The get_fixed_state callback takes precedence over polling the * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set * this if the switch provides such a callback. */ if (ds->ops->phylink_fixed_state) { dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state; dp->pl_config.poll_fixed_state = true; } ret = dsa_port_phylink_create(dp); if (ret) return ret; if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); if (ret == -ENODEV && ds->user_mii_bus) { /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags); } if (ret) { netdev_err(user_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); dsa_port_phylink_destroy(dp); } return ret; } void dsa_user_setup_tagger(struct net_device *user) { struct dsa_port *dp = dsa_user_to_port(user); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_user_priv *p = netdev_priv(user); const struct dsa_port *cpu_dp = dp->cpu_dp; const struct dsa_switch *ds = dp->ds; user->needed_headroom = cpu_dp->tag_ops->needed_headroom; user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the conduit) * by also inheriting the conduit's needed headroom and tailroom. * The 8021q driver also does this. */ user->needed_headroom += conduit->needed_headroom; user->needed_tailroom += conduit->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; user->lltx = true; } int dsa_user_suspend(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_detach(user_dev); rtnl_lock(); phylink_stop(dp->pl); rtnl_unlock(); return 0; } int dsa_user_resume(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_attach(user_dev); rtnl_lock(); phylink_start(dp->pl); rtnl_unlock(); return 0; } int dsa_user_create(struct dsa_port *port) { struct net_device *conduit = dsa_port_to_conduit(port); struct dsa_switch *ds = port->ds; struct net_device *user_dev; struct dsa_user_priv *p; const char *name; int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; if (port->name) { name = port->name; assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_ENUM; } user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name, assign_type, ether_setup, ds->num_tx_queues, 1); if (user_dev == NULL) return -ENOMEM; user_dev->rtnl_link_ops = &dsa_link_ops; user_dev->ethtool_ops = &dsa_user_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) user_dev->dcbnl_ops = &dsa_user_dcbnl_ops; #endif if (!is_zero_ether_addr(port->mac)) eth_hw_addr_set(user_dev, port->mac); else eth_hw_addr_inherit(user_dev, conduit); user_dev->priv_flags |= IFF_NO_QUEUE; if (dsa_switch_supports_uc_filtering(ds)) user_dev->priv_flags |= IFF_UNICAST_FLT; user_dev->netdev_ops = &dsa_user_netdev_ops; if (ds->ops->port_max_mtu) user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); SET_NETDEV_DEVTYPE(user_dev, &dsa_type); SET_NETDEV_DEV(user_dev, port->ds->dev); SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port); user_dev->dev.of_node = port->dn; user_dev->vlan_features = conduit->vlan_features; p = netdev_priv(user_dev); user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ret = gro_cells_init(&p->gcells, user_dev); if (ret) goto out_free; p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); port->user = user_dev; dsa_user_setup_tagger(user_dev); netif_carrier_off(user_dev); ret = dsa_user_phy_setup(user_dev); if (ret) { netdev_err(user_dev, "error %d setting up PHY for tree %d, switch %d, port %d\n", ret, ds->dst->index, ds->index, port->index); goto out_gcells; } rtnl_lock(); ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN); if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", ret, ETH_DATA_LEN, port->index); ret = register_netdevice(user_dev); if (ret) { netdev_err(conduit, "error %d registering interface %s\n", ret, user_dev->name); rtnl_unlock(); goto out_phy; } if (IS_ENABLED(CONFIG_DCB)) { ret = dsa_user_dcbnl_init(user_dev); if (ret) { netdev_err(user_dev, "failed to initialize DCB: %pe\n", ERR_PTR(ret)); rtnl_unlock(); goto out_unregister; } } ret = netdev_upper_dev_link(conduit, user_dev, NULL); rtnl_unlock(); if (ret) goto out_unregister; return 0; out_unregister: unregister_netdev(user_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: free_netdev(user_dev); port->user = NULL; return ret; } void dsa_user_destroy(struct net_device *user_dev) { struct net_device *conduit = dsa_user_to_conduit(user_dev); struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_user_priv *p = netdev_priv(user_dev); netif_carrier_off(user_dev); rtnl_lock(); netdev_upper_dev_unlink(conduit, user_dev); unregister_netdevice(user_dev); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_netdev(user_dev); } int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, struct netlink_ext_ack *extack) { struct net_device *old_conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct net_device *upper; struct list_head *iter; int err; if (conduit == old_conduit) return 0; if (!ds->ops->port_change_conduit) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support changing DSA conduit"); return -EOPNOTSUPP; } if (!netdev_uses_dsa(conduit)) { NL_SET_ERR_MSG_MOD(extack, "Interface not eligible as DSA conduit"); return -EOPNOTSUPP; } netdev_for_each_upper_dev_rcu(conduit, upper, iter) { if (dsa_user_dev_check(upper)) continue; if (netif_is_bridge_master(upper)) continue; NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers"); return -EOPNOTSUPP; } /* Since we allow live-changing the DSA conduit, plus we auto-open the * DSA conduit when the user port opens => we need to ensure that the * new DSA conduit is open too. */ if (dev->flags & IFF_UP) { err = dev_open(conduit, extack); if (err) return err; } netdev_upper_dev_unlink(old_conduit, dev); err = netdev_upper_dev_link(conduit, dev, extack); if (err) goto out_revert_old_conduit_unlink; err = dsa_port_change_conduit(dp, conduit, extack); if (err) goto out_revert_conduit_link; /* Update the MTU of the new CPU port through cross-chip notifiers */ err = dsa_user_change_mtu(dev, dev->mtu); if (err && err != -EOPNOTSUPP) { netdev_warn(dev, "nonfatal error updating MTU with new conduit: %pe\n", ERR_PTR(err)); } return 0; out_revert_conduit_link: netdev_upper_dev_unlink(conduit, dev); out_revert_old_conduit_unlink: netdev_upper_dev_link(old_conduit, dev, NULL); return err; } bool dsa_user_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_user_netdev_ops; } EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (is_hsr_master(info->upper_dev)) { if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_hsr_leave(dp, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; dp = dsa_user_to_port(dev); if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot * meaningfully placed under a bridge yet */ return NOTIFY_DONE; } static int dsa_user_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_changeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } /* Same as dsa_user_lag_changeupper() except that it calls * dsa_user_prechangeupper() */ static int dsa_user_lag_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_prechangeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *user, *br; struct dsa_port *dp; ext_ack = netdev_notifier_info_to_extack(&info->info); if (!is_vlan_dev(dev)) return NOTIFY_DONE; user = vlan_dev_real_dev(dev); if (!dsa_user_dev_check(user)) return NOTIFY_DONE; dp = dsa_user_to_port(user); br = dsa_port_bridge_dev_get(dp); if (!br) return NOTIFY_DONE; /* Deny enslaving a VLAN device into a VLAN-aware bridge */ if (br_vlan_enabled(br) && netif_is_bridge_master(info->upper_dev) && info->linking) { NL_SET_ERR_MSG_MOD(ext_ack, "Cannot make VLAN device join VLAN-aware bridge"); return notifier_from_errno(-EINVAL); } return NOTIFY_DONE; } static int dsa_user_check_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_user_to_port(dev); struct net_device *br = dsa_port_bridge_dev_get(dp); struct bridge_vlan_info br_info; struct netlink_ext_ack *extack; int err = NOTIFY_DONE; u16 vid; if (!br || !br_vlan_enabled(br)) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); vid = vlan_dev_vlan_id(info->upper_dev); /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { NL_SET_ERR_MSG_MOD(extack, "This VLAN is already configured by the bridge"); return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } static int dsa_user_prechangeupper_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_switch *ds; struct dsa_port *dp; int err; if (!dsa_user_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, info); dp = dsa_user_to_port(dev); ds = dp->ds; if (ds->ops->port_prechangeupper) { err = ds->ops->port_prechangeupper(ds, dp->index, info); if (err) return notifier_from_errno(err); } if (is_vlan_dev(info->upper_dev)) return dsa_user_check_8021q_upper(dev, info); return NOTIFY_DONE; } /* To be eligible as a DSA conduit, a LAG must have all lower interfaces be * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of * switches in the same switch tree. */ static int dsa_lag_conduit_validate(struct net_device *lag_dev, struct netlink_ext_ack *extack) { struct net_device *lower1, *lower2; struct list_head *iter1, *iter2; netdev_for_each_lower_dev(lag_dev, lower1, iter1) { netdev_for_each_lower_dev(lag_dev, lower2, iter2) { if (!netdev_uses_dsa(lower1) || !netdev_uses_dsa(lower2)) { NL_SET_ERR_MSG_MOD(extack, "All LAG ports must be eligible as DSA conduits"); return notifier_from_errno(-EINVAL); } if (lower1 == lower2) continue; if (!dsa_port_tree_same(lower1->dsa_ptr, lower2->dsa_ptr)) { NL_SET_ERR_MSG_MOD(extack, "LAG contains DSA conduits of disjoint switch trees"); return notifier_from_errno(-EINVAL); } } } return NOTIFY_DONE; } static int dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(conduit)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; /* Allow DSA switch uppers */ if (dsa_user_dev_check(info->upper_dev)) return NOTIFY_DONE; /* Allow bridge uppers of DSA conduits, subject to further * restrictions in dsa_bridge_prechangelower_sanity_check() */ if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; /* Allow LAG uppers, subject to further restrictions in * dsa_lag_conduit_prechangelower_sanity_check() */ if (netif_is_lag_master(info->upper_dev)) return dsa_lag_conduit_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA conduit cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } static int dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); struct net_device *lag_dev = info->upper_dev; struct net_device *lower; struct list_head *iter; if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; if (!netdev_uses_dsa(dev)) { NL_SET_ERR_MSG(extack, "Only DSA conduits can join a LAG DSA conduit"); return notifier_from_errno(-EINVAL); } netdev_for_each_lower_dev(lag_dev, lower, iter) { if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { NL_SET_ERR_MSG(extack, "Interface is DSA conduit for a different switch tree than this LAG"); return notifier_from_errno(-EINVAL); } break; } return NOTIFY_DONE; } /* Don't allow bridging of DSA conduits, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these * frames). * The only case where that would not be an issue is when bridging can already * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev * port, and is bridged only with other ports from the same hardware device. */ static int dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, struct netdev_notifier_changeupper_info *info) { struct net_device *br = info->upper_dev; struct netlink_ext_ack *extack; struct net_device *lower; struct list_head *iter; if (!netif_is_bridge_master(br)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); netdev_for_each_lower_dev(br, lower, iter) { if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) continue; if (!netdev_port_same_parent_id(lower, new_lower)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA conduit"); return notifier_from_errno(-EINVAL); } } return NOTIFY_DONE; } static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst, struct net_device *lag_dev) { struct net_device *new_conduit = dsa_tree_find_first_conduit(dst); struct dsa_port *dp; int err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, new_conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", new_conduit->name, ERR_PTR(err)); } } } static int dsa_conduit_lag_join(struct net_device *conduit, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; int err; err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack); if (err) return err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != conduit) continue; err = dsa_user_change_conduit(dp->user, lag_dev, extack); if (err) goto restore; } return 0; restore: dsa_tree_for_each_user_port_continue_reverse(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", conduit->name, ERR_PTR(err)); } } dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); return err; } static void dsa_conduit_lag_leave(struct net_device *conduit, struct net_device *lag_dev) { struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *new_cpu_dp = NULL; struct net_device *lower; struct list_head *iter; netdev_for_each_lower_dev(lag_dev, lower, iter) { if (netdev_uses_dsa(lower)) { new_cpu_dp = lower->dsa_ptr; break; } } if (new_cpu_dp) { /* Update the CPU port of the user ports still under the LAG * so that dsa_port_to_conduit() continues to work properly */ dsa_tree_for_each_user_port(dp, dst) if (dsa_port_to_conduit(dp) == lag_dev) dp->cpu_dp = new_cpu_dp; /* Update the index of the virtual CPU port to match the lowest * physical CPU port */ lag_dev->dsa_ptr = new_cpu_dp; wmb(); } else { /* If the LAG DSA conduit has no ports left, migrate back all * user ports to the first physical CPU port */ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev); } /* This DSA conduit has left its LAG in any case, so let * the CPU port leave the hardware LAG as well */ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); } static int dsa_conduit_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!netdev_uses_dsa(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_conduit_lag_join(dev, info->upper_dev, info->upper_info, extack); err = notifier_from_errno(err); } else { dsa_conduit_lag_leave(dev, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; int err; err = dsa_user_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_conduit_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_lag_conduit_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_user_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGEUPPER: { int err; err = dsa_user_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_conduit_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; int err = 0; if (dsa_user_dev_check(dev)) { dp = dsa_user_to_port(dev); err = dsa_port_lag_change(dp, info->lower_state_info); } /* Mirror LAG port events on DSA conduits that are in * a LAG towards their respective switch CPU ports */ if (netdev_uses_dsa(dev)) { dp = dev->dsa_ptr; err = dsa_port_lag_change(dp, info->lower_state_info); } return notifier_from_errno(err); } case NETDEV_CHANGE: case NETDEV_UP: { /* Track state of conduit port. * DSA driver may require the conduit port (and indirectly * the tagger) to be available for some special operation. */ if (netdev_uses_dsa(dev)) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->ds->dst; /* Track when the conduit port is UP */ dsa_tree_conduit_oper_state_change(dst, dev, netif_oper_up(dev)); /* Track when the conduit port is ready and can accept * packet. * NETDEV_UP event is not enough to flag a port as ready. * We also have to wait for linkwatch_do_dev to dev_activate * and emit a NETDEV_CHANGE event. * We check if a conduit port is ready by checking if the dev * have a qdisc assigned and is not noop. */ dsa_tree_conduit_admin_state_change(dst, dev, !qdisc_tx_is_noop(dev)); return NOTIFY_OK; } return NOTIFY_DONE; } case NETDEV_GOING_DOWN: { struct dsa_port *dp, *cpu_dp; struct dsa_switch_tree *dst; LIST_HEAD(close_list); if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; cpu_dp = dev->dsa_ptr; dst = cpu_dp->ds->dst; dsa_tree_conduit_admin_state_change(dst, dev, false); list_for_each_entry(dp, &dst->ports, list) { if (!dsa_port_is_user(dp)) continue; if (dp->cpu_dp != cpu_dp) continue; list_add(&dp->user->close_list, &close_list); } netif_close_many(&close_list, true); return NOTIFY_OK; } default: break; } return NOTIFY_DONE; } static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, switchdev_work->orig_dev, &info.info, NULL); } static void dsa_user_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); const unsigned char *addr = switchdev_work->addr; struct net_device *dev = switchdev_work->dev; u16 vid = switchdev_work->vid; struct dsa_switch *ds; struct dsa_port *dp; int err; dp = dsa_user_to_port(dev); ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_add(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_add(dp, addr, vid); else err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_del(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_del(dp, addr, vid); else err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; } kfree(switchdev_work); } static bool dsa_foreign_dev_check(const struct net_device *dev, const struct net_device *foreign_dev) { const struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch_tree *dst = dp->ds->dst; if (netif_is_bridge_master(foreign_dev)) return !dsa_tree_offloads_bridge_dev(dst, foreign_dev); if (netif_is_bridge_port(foreign_dev)) return !dsa_tree_offloads_bridge_port(dst, foreign_dev); /* Everything else is foreign */ return true; } static int dsa_user_fdb_event(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info) { struct dsa_switchdev_event_work *switchdev_work; struct dsa_port *dp = dsa_user_to_port(dev); bool host_addr = fdb_info->is_local; struct dsa_switch *ds = dp->ds; if (ctx && ctx != dp) return 0; if (!dp->bridge) return 0; if (switchdev_fdb_is_dynamically_learned(fdb_info)) { if (dsa_port_offloads_bridge_port(dp, orig_dev)) return 0; /* FDB entries learned by the software bridge or by foreign * bridge ports should be installed as host addresses only if * the driver requests assisted learning. */ if (!ds->assisted_learning_on_cpu_port) return 0; } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. */ if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; /* Check early that we're not doing work in vain. * Host addresses on LAG ports still require regular FDB ops, * since the CPU port isn't in a LAG. */ if (dp->lag && !host_addr) { if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) return -EOPNOTSUPP; } else { if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) return -EOPNOTSUPP; } switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", orig_dev->name, fdb_info->addr, fdb_info->vid, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work); switchdev_work->event = event; switchdev_work->dev = dev; switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; dsa_schedule_work(&switchdev_work->work); return 0; } /* Called under rcu_read_lock() */ static int dsa_user_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; } return NOTIFY_OK; } static int dsa_user_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_OBJ_ADD: err = switchdev_handle_port_obj_add_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: err = switchdev_handle_port_obj_del_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); } return NOTIFY_DONE; } static struct notifier_block dsa_user_nb __read_mostly = { .notifier_call = dsa_user_netdevice_event, }; struct notifier_block dsa_user_switchdev_notifier = { .notifier_call = dsa_user_switchdev_event, }; struct notifier_block dsa_user_switchdev_blocking_notifier = { .notifier_call = dsa_user_switchdev_blocking_event, }; int dsa_user_register_notifier(void) { struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_user_nb); if (err) return err; err = register_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) goto err_switchdev_nb; nb = &dsa_user_switchdev_blocking_notifier; err = register_switchdev_blocking_notifier(nb); if (err) goto err_switchdev_blocking_nb; return 0; err_switchdev_blocking_nb: unregister_switchdev_notifier(&dsa_user_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_user_nb); return err; } void dsa_user_unregister_notifier(void) { struct notifier_block *nb; int err; nb = &dsa_user_switchdev_blocking_notifier; err = unregister_switchdev_blocking_notifier(nb); if (err) pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); err = unregister_netdevice_notifier(&dsa_user_nb); if (err) pr_err("DSA: failed to unregister user notifier (%d)\n", err); }
23 24 13 3 3 2 49 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_RULESET_H #define _SECURITY_LANDLOCK_RULESET_H #include <linux/cleanup.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include "access.h" #include "limits.h" #include "object.h" struct landlock_hierarchy; /** * struct landlock_layer - Access rights for a given layer */ struct landlock_layer { /** * @level: Position of this layer in the layer stack. */ u16 level; /** * @access: Bitfield of allowed actions on the kernel object. They are * relative to the object type (e.g. %LANDLOCK_ACTION_FS_READ). */ access_mask_t access; }; /** * union landlock_key - Key of a ruleset's red-black tree */ union landlock_key { /** * @object: Pointer to identify a kernel object (e.g. an inode). */ struct landlock_object *object; /** * @data: Raw data to identify an arbitrary 32-bit value * (e.g. a TCP port). */ uintptr_t data; }; /** * enum landlock_key_type - Type of &union landlock_key */ enum landlock_key_type { /** * @LANDLOCK_KEY_INODE: Type of &landlock_ruleset.root_inode's node * keys. */ LANDLOCK_KEY_INODE = 1, /** * @LANDLOCK_KEY_NET_PORT: Type of &landlock_ruleset.root_net_port's * node keys. */ LANDLOCK_KEY_NET_PORT, }; /** * struct landlock_id - Unique rule identifier for a ruleset */ struct landlock_id { /** * @key: Identifies either a kernel object (e.g. an inode) or * a raw value (e.g. a TCP port). */ union landlock_key key; /** * @type: Type of a landlock_ruleset's root tree. */ const enum landlock_key_type type; }; /** * struct landlock_rule - Access rights tied to an object */ struct landlock_rule { /** * @node: Node in the ruleset's red-black tree. */ struct rb_node node; /** * @key: A union to identify either a kernel object (e.g. an inode) or * a raw data value (e.g. a network socket port). This is used as a key * for this ruleset element. The pointer is set once and never * modified. It always points to an allocated object because each rule * increments the refcount of its object. */ union landlock_key key; /** * @num_layers: Number of entries in @layers. */ u32 num_layers; /** * @layers: Stack of layers, from the latest to the newest, implemented * as a flexible array member (FAM). */ struct landlock_layer layers[] __counted_by(num_layers); }; /** * struct landlock_ruleset - Landlock ruleset * * This data structure must contain unique entries, be updatable, and quick to * match an object. */ struct landlock_ruleset { /** * @root_inode: Root of a red-black tree containing &struct * landlock_rule nodes with inode object. Once a ruleset is tied to a * process (i.e. as a domain), this tree is immutable until @usage * reaches zero. */ struct rb_root root_inode; #if IS_ENABLED(CONFIG_INET) /** * @root_net_port: Root of a red-black tree containing &struct * landlock_rule nodes with network port. Once a ruleset is tied to a * process (i.e. as a domain), this tree is immutable until @usage * reaches zero. */ struct rb_root root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ /** * @hierarchy: Enables hierarchy identification even when a parent * domain vanishes. This is needed for the ptrace protection. */ struct landlock_hierarchy *hierarchy; union { /** * @work_free: Enables to free a ruleset within a lockless * section. This is only used by * landlock_put_ruleset_deferred() when @usage reaches zero. * The fields @lock, @usage, @num_rules, @num_layers and * @access_masks are then unused. */ struct work_struct work_free; struct { /** * @lock: Protects against concurrent modifications of * @root, if @usage is greater than zero. */ struct mutex lock; /** * @usage: Number of processes (i.e. domains) or file * descriptors referencing this ruleset. */ refcount_t usage; /** * @num_rules: Number of non-overlapping (i.e. not for * the same object) rules in this ruleset. */ u32 num_rules; /** * @num_layers: Number of layers that are used in this * ruleset. This enables to check that all the layers * allow an access request. A value of 0 identifies a * non-merged ruleset (i.e. not a domain). */ u32 num_layers; /** * @access_masks: Contains the subset of filesystem and * network actions that are restricted by a ruleset. * A domain saves all layers of merged rulesets in a * stack (FAM), starting from the first layer to the * last one. These layers are used when merging * rulesets, for user space backward compatibility * (i.e. future-proof), and to properly handle merged * rulesets without overlapping access rights. These * layers are set once and never changed for the * lifetime of the ruleset. */ struct access_masks access_masks[]; }; }; }; struct landlock_ruleset * landlock_create_ruleset(const access_mask_t access_mask_fs, const access_mask_t access_mask_net, const access_mask_t scope_mask); void landlock_put_ruleset(struct landlock_ruleset *const ruleset); void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset); DEFINE_FREE(landlock_put_ruleset, struct landlock_ruleset *, if (!IS_ERR_OR_NULL(_T)) landlock_put_ruleset(_T)) int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access); struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset); const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id); static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset) { if (ruleset) refcount_inc(&ruleset->usage); } /** * landlock_union_access_masks - Return all access rights handled in the * domain * * @domain: Landlock ruleset (used as a domain) * * Returns: an access_masks result of the OR of all the domain's access masks. */ static inline struct access_masks landlock_union_access_masks(const struct landlock_ruleset *const domain) { union access_masks_all matches = {}; size_t layer_level; for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { union access_masks_all layer = { .masks = domain->access_masks[layer_level], }; matches.all |= layer.all; } return matches.masks; } static inline void landlock_add_fs_access_mask(struct landlock_ruleset *const ruleset, const access_mask_t fs_access_mask, const u16 layer_level) { access_mask_t fs_mask = fs_access_mask & LANDLOCK_MASK_ACCESS_FS; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(fs_access_mask != fs_mask); ruleset->access_masks[layer_level].fs |= fs_mask; } static inline void landlock_add_net_access_mask(struct landlock_ruleset *const ruleset, const access_mask_t net_access_mask, const u16 layer_level) { access_mask_t net_mask = net_access_mask & LANDLOCK_MASK_ACCESS_NET; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(net_access_mask != net_mask); ruleset->access_masks[layer_level].net |= net_mask; } static inline void landlock_add_scope_mask(struct landlock_ruleset *const ruleset, const access_mask_t scope_mask, const u16 layer_level) { access_mask_t mask = scope_mask & LANDLOCK_MASK_SCOPE; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(scope_mask != mask); ruleset->access_masks[layer_level].scope |= mask; } static inline access_mask_t landlock_get_fs_access_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { /* Handles all initially denied by default access rights. */ return ruleset->access_masks[layer_level].fs | _LANDLOCK_ACCESS_FS_INITIALLY_DENIED; } static inline access_mask_t landlock_get_net_access_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { return ruleset->access_masks[layer_level].net; } static inline access_mask_t landlock_get_scope_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { return ruleset->access_masks[layer_level].scope; } bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size); access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type); #endif /* _SECURITY_LANDLOCK_RULESET_H */
4506 1163 455 1130 4 1082 118 48 1128 4 1130 1133 1 955 959 278 736 959 445 906 5 5 905 216 906 5 55 956 38 45 44 40 593 422 178 22 235 311 315 314 171 105 295 1074 1077 1074 762 298 315 52 315 315 295 6 858 689 11 677 674 678 7 7 1 8 665 596 103 104 92 15 253 2 253 251 253 203 71 70 9 68 251 9 245 2 1 1 1 12 15 17 16 62 63 1 1 61 2 45 44 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 // SPDX-License-Identifier: GPL-2.0-only /* * mm/readahead.c - address_space-level file readahead. * * Copyright (C) 2002, Linus Torvalds * * 09Apr2002 Andrew Morton * Initial version. */ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/readahead.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* Clean up the remaining folios. */ while ((folio = readahead_folio(rac)) != NULL) { folio_get(folio); filemap_remove_folio(folio); folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } static struct folio *ractl_alloc_folio(struct readahead_control *ractl, gfp_t gfp_mask, unsigned int order) { struct folio *folio; folio = filemap_alloc_folio(gfp_mask, order); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); return folio; } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read, lookahead_size); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); /* * As iterator `i` is aligned to min_nrpages, round_up the * difference between nr_to_read and lookahead_size to mark the * index that only has lookahead or "async_region" to set the * readahead flag. */ if (lookahead_size <= nr_to_read) { unsigned long ra_folio_index; ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, min_nrpages); mark = ra_folio_index - index; } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; /* * Preallocate as many pages as we will need. */ while (i < nr_to_read) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. */ read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } folio = ractl_alloc_folio(ractl, gfp_mask, mapping_min_folio_order(mapping)); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } if (i == mark) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages += min_nrpages; i += min_nrpages; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; do_page_cache_ra(ractl, this_chunk, 0); nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. * * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra) { struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); pgoff_t index = start; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { ra->order = 0; goto fallback; } limit = min(limit, index + ra->size - 1); new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); ra->order = new_order; /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * If the new_order is greater than min_order and index is * already aligned to new_order, then this will be noop as index * aligned to new_order should also be aligned to min_order. */ ractl->_index = mapping_align_index(mapping, index); index = readahead_index(ractl); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (order > min_order && index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation below. */ if (!err) return; fallback: /* * ->readahead() may have updated readahead window size so we have to * check there's still something to read. */ if (ra->size > index - start) do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ractl->ra->ra_pages; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); return max_pages; } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { pgoff_t index = readahead_index(ractl); bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); struct file_ra_state *ra = ractl->ra; unsigned long max_pages, contig_count; pgoff_t prev_index, miss; trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } max_pages = ractl_max_pages(ractl, req_count); prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; /* * A start of file, oversized read, or sequential cache miss: * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ if (!index || req_count > max_pages || index - prev_index <= 1UL) { ra->start = index; ra->size = get_init_ra_size(req_count, max_pages); ra->async_size = ra->size > req_count ? ra->size - req_count : ra->size >> 1; goto readit; } /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ rcu_read_lock(); miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages); rcu_read_unlock(); contig_count = index - miss - 1; /* * Standalone, small random read. Read as is, and do not pollute the * readahead state. */ if (contig_count <= req_count) { do_page_cache_ra(ractl, req_count, 0); return; } /* * File cached from the beginning: * it is a strong indication of long-run stream (or whole-file-read) */ if (miss == ULONG_MAX) contig_count *= 2; ra->start = index; ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: ra->order = 0; ractl->_index = ra->start; page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); pgoff_t expected, start, end, aligned_end, align; /* no readahead */ if (!ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count); folio_clear_readahead(folio); if (blk_cgroup_congested()) return; max_pages = ractl_max_pages(ractl, req_count); /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, folio_nr_pages(folio)); if (index == expected) { ra->start += ra->size; /* * In the case of MADV_HUGEPAGE, the actual size might exceed * the readahead window. */ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); readit: ra->order += 2; align = 1UL << min(ra->order, ffs(max_pages) - 1); end = ra->start + ra->size; aligned_end = round_down(end, align); if (aligned_end > ra->start) ra->size -= end - aligned_end; ra->async_size = ra->size; ractl->_index = ra->start; page_cache_ra_order(ractl, ra); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { struct file *file; const struct inode *inode; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; file = fd_file(f); if (!(file->f_mode & FMODE_READ)) return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ if (!file->f_mapping) return -EINVAL; if (!file->f_mapping->a_ops) return -EINVAL; inode = file_inode(file); if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -EINVAL; if (IS_ANON_FILE(inode)) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); unsigned int min_order = mapping_min_folio_order(mapping); new_index = new_start / PAGE_SIZE; /* * Readahead code should have aligned the ractl->_index to * min_nrpages before calling readahead aops. */ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; if (ra) { ra->size += min_nrpages; ra->async_size += min_nrpages; } } } EXPORT_SYMBOL(readahead_expand);
10 2 23 1 17 1 69 23 3 6 17 3 2 21 18 12 18 10 1 10 3 3 10 8 15713 136 44 171 48 144 90 15710 477 15594 5747 9964 9906 175 461 167 3 11 1 1 4 31 15034 88 15086 82 15140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * VLAN An implementation of 802.1Q VLAN tagging. * * Authors: Ben Greear <greearb@candelatech.com> */ #ifndef _LINUX_IF_VLAN_H_ #define _LINUX_IF_VLAN_H_ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/bug.h> #include <uapi/linux/if_vlan.h> #define VLAN_HLEN 4 /* The additional bytes required by VLAN * (in addition to the Ethernet header) */ #define VLAN_ETH_HLEN 18 /* Total octets in header. */ #define VLAN_ETH_ZLEN 64 /* Min. octets in frame sans FCS */ /* * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan */ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ #define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /** * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) * @h_dest: destination ethernet address * @h_source: source ethernet address * @h_vlan_proto: ethernet protocol * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { struct_group(addrs, unsigned char h_dest[ETH_ALEN]; unsigned char h_source[ETH_ALEN]; ); __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; #include <linux/skbuff.h> static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb_mac_header(skb); } /* Prefer this version in TX path, instead of * skb_reset_mac_header() + vlan_eth_hdr() */ static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb->data; } #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */ #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); #define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); } static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); } /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets * @rx_bytes: number of received bytes * @rx_multicast: number of received multicast packets * @tx_packets: number of transmitted packets * @tx_bytes: number of transmitted bytes * @syncp: synchronization point for 64bit counters * @rx_errors: number of rx errors * @tx_dropped: number of tx drops */ struct vlan_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_errors; u32 tx_dropped; }; #if IS_ENABLED(CONFIG_VLAN_8021Q) extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id); extern int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg); extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); /** * struct vlan_priority_tci_mapping - vlan egress priority mappings * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; struct vlan_priority_tci_mapping *next; }; struct proc_dir_entry; struct netpoll; /** * struct vlan_dev_priv - VLAN private device data * @nr_ingress_mappings: number of ingress priority mappings * @ingress_priority_map: ingress priority mappings * @nr_egress_mappings: number of egress priority mappings * @egress_priority_map: hash of egress priority mappings * @vlan_proto: VLAN encapsulation protocol * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice * @dev_tracker: refcount tracker for @real_dev reference * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats * @netpoll: netpoll instance "propagated" down to @real_dev */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; struct vlan_priority_tci_mapping *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; u16 flags; struct net_device *real_dev; netdevice_tracker dev_tracker; unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) { return netdev_priv(dev); } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; while (mp) { if (mp->priority == skprio) { return mp->vlan_qos; /* This should already be shifted * to mask correctly with the * VLAN's TCI */ } mp = mp->next; } return 0; } extern bool vlan_do_receive(struct sk_buff **skb); extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid); extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid); extern int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev); extern void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev); extern bool vlan_uses_dev(const struct net_device *dev); #else static inline bool is_vlan_dev(const struct net_device *dev) { return false; } static inline struct net_device * __vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { return NULL; } static inline int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { return 0; } static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) { WARN_ON_ONCE(1); return NULL; } static inline u16 vlan_dev_vlan_id(const struct net_device *dev) { WARN_ON_ONCE(1); return 0; } static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev) { WARN_ON_ONCE(1); return 0; } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { return 0; } static inline bool vlan_do_receive(struct sk_buff **skb) { return false; } static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { return 0; } static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { } static inline int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { return 0; } static inline void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { } static inline bool vlan_uses_dev(const struct net_device *dev) { return false; } #endif /** * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { switch (ethertype) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { if (proto == htons(ETH_P_8021Q) && features & NETIF_F_HW_VLAN_CTAG_TX) return true; if (proto == htons(ETH_P_8021AD) && features & NETIF_F_HW_VLAN_STAG_TX) return true; return false; } /** * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Does not change skb->protocol so this function can be used during receive. * * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { struct vlan_ethhdr *veth; if (skb_cow_head(skb, VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); if (skb_mac_header_was_set(skb)) skb->mac_header -= VLAN_HLEN; veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); /* first, the ethernet type */ if (likely(mac_len >= ETH_TLEN)) { /* h_vlan_encapsulated_proto should already be populated, and * skb->data has space for h_vlan_proto */ veth->h_vlan_proto = vlan_proto; } else { /* h_vlan_encapsulated_proto should not be populated, and * skb->data has no space for h_vlan_proto */ veth->h_vlan_encapsulated_proto = skb->protocol; } /* now, the TCI */ veth->h_vlan_TCI = htons(vlan_tci); return 0; } /** * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Does not change skb->protocol so this function can be used during receive. * * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { int err; err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; } return skb; } /** * vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. This might change skb->head. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) skb->protocol = vlan_proto; return skb; } /** * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info * @skb: skbuff to clear * * Clears the VLAN information from @skb */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_all = 0; } /** * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb * @dst: skbuff to copy to * @src: skbuff to copy from * * Copies VLAN information from @src to @dst (for branchless code) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { dst->vlan_all = src->vlan_all; } /* * __vlan_hwaccel_push_inside - pushes vlan tag to the payload * @skb: skbuff to tag * * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (likely(skb)) __vlan_hwaccel_clear_tag(skb); return skb; } /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest */ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; } /** * __vlan_get_tag - get the VLAN ID that is part of the payload * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; } /** * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[] * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb_vlan_tag_present(skb)) { *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; return -ENODATA; } } /** * vlan_get_tag - get the VLAN ID from the skb * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX) { return __vlan_hwaccel_get_tag(skb, vlan_tci); } else { return __vlan_get_tag(skb, vlan_tci); } } /** * __vlan_get_protocol_offset() - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, __be16 type, int mac_offset, int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { vlan_depth = ETH_HLEN; } do { struct vlan_hdr vhdr, *vh; vh = skb_header_pointer(skb, mac_offset + vlan_depth, sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); } if (depth) *depth = vlan_depth; return type; } static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { return __vlan_get_protocol_offset(skb, type, 0, depth); } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } /* This version of __vlan_get_protocol() also pulls mac header in skb->head */ static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, __be16 type, int *depth) { int maclen; type = __vlan_get_protocol(skb, type, &maclen); if (type) { if (!pskb_may_pull(skb, maclen)) type = 0; else if (depth) *depth = maclen; } return type; } /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) { if (!skip_vlan) /* VLAN acceleration strips the VLAN header from the skb and * moves it to skb->vlan_proto */ return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; return vlan_get_protocol(skb); } static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { __be16 proto; unsigned short *rawp; /* * Was a VLAN packet, grab the encapsulated protocol, which the layer * three protocols care about. */ proto = vhdr->h_vlan_encapsulated_proto; if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } rawp = (unsigned short *)(vhdr + 1); if (*rawp == 0xFFFF) /* * This is a magic hack to spot IPX packets. Older Novell * breaks the protocol design and runs IPX over 802.3 without * an 802.2 LLC layer. We look for FFFF which isn't a used * 802.2 SSAP/DSAP. This won't work for fault tolerant netware * but does for the rest. */ skb->protocol = htons(ETH_P_802_3); else /* * Real 802.2 LLC */ skb->protocol = htons(ETH_P_802_2); } /** * vlan_remove_tag - remove outer VLAN tag from payload * @skb: skbuff to remove tag from * @vlan_tci: buffer to store value * * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * * Returns: a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); *vlan_tci = ntohs(vhdr->h_vlan_TCI); memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); return __skb_pull(skb, VLAN_HLEN); } /** * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && likely(!eth_type_vlan(skb->protocol))) return false; return true; } /** * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) { __be16 protocol = skb->protocol; if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; if (likely(!eth_type_vlan(protocol))) return false; if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) return false; veh = skb_vlan_eth_hdr(skb); protocol = veh->h_vlan_encapsulated_proto; } if (!eth_type_vlan(protocol)) return false; return true; } /** * vlan_features_check - drop unsafe features for skb with multiple tags. * @skb: skbuff to query * @features: features to be checked * * Returns: features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tagged_multi(skb)) { /* In the case of multi-tagged packets, use a direct mask * instead of using netdev_interesect_features(), to make * sure that only devices supporting NETIF_F_HW_CSUM will * have checksum offloading support. */ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } return features; } /** * compare_vlan_header - Compare two vlan headers * @h1: Pointer to vlan header * @h2: Pointer to vlan header * * Compare two vlan headers. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. * * Return: 0 if equal, arbitrary non-zero value if not equal. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return *(u32 *)h1 ^ *(u32 *)h2; #else return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) | ((__force u32)h1->h_vlan_encapsulated_proto ^ (__force u32)h2->h_vlan_encapsulated_proto); #endif } #endif /* !(_LINUX_IF_VLAN_H_) */
20 290 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_LABELS_H #define _NF_CONNTRACK_LABELS_H #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <uapi/linux/netfilter/xt_connlabel.h> #define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) struct nf_conn_labels { unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)]; }; /* Can't use nf_ct_ext_find(), flow dissector cannot use symbols * exported by nf_conntrack module. */ static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_LABELS struct nf_ct_ext *ext = ct->ext; if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS)) return NULL; return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS]; #else return NULL; #endif } static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_LABELS struct net *net = nf_ct_net(ct); if (atomic_read(&net->ct.labels_used) == 0) return NULL; return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC); #else return NULL; #endif } int nf_connlabels_replace(struct nf_conn *ct, const u32 *data, const u32 *mask, unsigned int words); #ifdef CONFIG_NF_CONNTRACK_LABELS int nf_connlabels_get(struct net *net, unsigned int bit); void nf_connlabels_put(struct net *net); #else static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; } static inline void nf_connlabels_put(struct net *net) {} #endif #endif /* _NF_CONNTRACK_LABELS_H */
9 8 2 9 1 7 2 2 7 6 1 1 84 84 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 // SPDX-License-Identifier: GPL-2.0-or-later /* * Misc and compatibility things * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/fs.h> #include <sound/core.h> void release_and_free_resource(struct resource *res) { if (res) { release_resource(res); kfree(res); } } EXPORT_SYMBOL(release_and_free_resource); #ifdef CONFIG_PCI #include <linux/pci.h> /** * snd_pci_quirk_lookup_id - look up a PCI SSID quirk list * @vendor: PCI SSV id * @device: PCI SSD id * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list) { const struct snd_pci_quirk *q; for (q = list; q->subvendor || q->subdevice; q++) { if (q->subvendor != vendor) continue; if (!q->subdevice || (device & q->subdevice_mask) == q->subdevice) return q; } return NULL; } EXPORT_SYMBOL(snd_pci_quirk_lookup_id); /** * snd_pci_quirk_lookup - look up a PCI SSID quirk list * @pci: pci_dev handle * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list) { if (!pci) return NULL; return snd_pci_quirk_lookup_id(pci->subsystem_vendor, pci->subsystem_device, list); } EXPORT_SYMBOL(snd_pci_quirk_lookup); #endif /* * Deferred async signal helpers * * Below are a few helper functions to wrap the async signal handling * in the deferred work. The main purpose is to avoid the messy deadlock * around tasklist_lock and co at the kill_fasync() invocation. * fasync_helper() and kill_fasync() are replaced with snd_fasync_helper() * and snd_kill_fasync(), respectively. In addition, snd_fasync_free() has * to be called at releasing the relevant file object. */ struct snd_fasync { struct fasync_struct *fasync; int signal; int poll; int on; struct list_head list; }; static DEFINE_SPINLOCK(snd_fasync_lock); static LIST_HEAD(snd_fasync_list); static void snd_fasync_work_fn(struct work_struct *work) { struct snd_fasync *fasync; spin_lock_irq(&snd_fasync_lock); while (!list_empty(&snd_fasync_list)) { fasync = list_first_entry(&snd_fasync_list, struct snd_fasync, list); list_del_init(&fasync->list); spin_unlock_irq(&snd_fasync_lock); if (fasync->on) kill_fasync(&fasync->fasync, fasync->signal, fasync->poll); spin_lock_irq(&snd_fasync_lock); } spin_unlock_irq(&snd_fasync_lock); } static DECLARE_WORK(snd_fasync_work, snd_fasync_work_fn); int snd_fasync_helper(int fd, struct file *file, int on, struct snd_fasync **fasyncp) { struct snd_fasync *fasync = NULL; if (on) { fasync = kzalloc(sizeof(*fasync), GFP_KERNEL); if (!fasync) return -ENOMEM; INIT_LIST_HEAD(&fasync->list); } scoped_guard(spinlock_irq, &snd_fasync_lock) { if (*fasyncp) { kfree(fasync); fasync = *fasyncp; } else { if (!fasync) return 0; *fasyncp = fasync; } fasync->on = on; } return fasync_helper(fd, file, on, &fasync->fasync); } EXPORT_SYMBOL_GPL(snd_fasync_helper); void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll) { if (!fasync || !fasync->on) return; guard(spinlock_irqsave)(&snd_fasync_lock); fasync->signal = signal; fasync->poll = poll; list_move(&fasync->list, &snd_fasync_list); schedule_work(&snd_fasync_work); } EXPORT_SYMBOL_GPL(snd_kill_fasync); void snd_fasync_free(struct snd_fasync *fasync) { if (!fasync) return; fasync->on = 0; flush_work(&snd_fasync_work); kfree(fasync); } EXPORT_SYMBOL_GPL(snd_fasync_free);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Mellanox Technologies. All rights reserved */ #include <linux/debugfs.h> #include <linux/err.h> #include <linux/etherdevice.h> #include <linux/inet.h> #include <linux/kernel.h> #include <linux/random.h> #include <linux/slab.h> #include <net/devlink.h> #include <net/ip.h> #include <net/psample.h> #include <uapi/linux/ip.h> #include <uapi/linux/udp.h> #include "netdevsim.h" #define NSIM_PSAMPLE_REPORT_INTERVAL_MS 100 #define NSIM_PSAMPLE_INVALID_TC 0xFFFF #define NSIM_PSAMPLE_L4_DATA_LEN 100 struct nsim_dev_psample { struct delayed_work psample_dw; struct dentry *ddir; struct psample_group *group; u32 rate; u32 group_num; u32 trunc_size; int in_ifindex; int out_ifindex; u16 out_tc; u64 out_tc_occ_max; u64 latency_max; bool is_active; }; static struct sk_buff *nsim_dev_psample_skb_build(void) { int tot_len, data_len = NSIM_PSAMPLE_L4_DATA_LEN; struct sk_buff *skb; struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + data_len; skb_reset_mac_header(skb); eth = skb_put(skb, sizeof(struct ethhdr)); eth_random_addr(eth->h_dest); eth_random_addr(eth->h_source); eth->h_proto = htons(ETH_P_IP); skb->protocol = htons(ETH_P_IP); skb_set_network_header(skb, skb->len); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = IPPROTO_UDP; iph->saddr = in_aton("192.0.2.1"); iph->daddr = in_aton("198.51.100.1"); iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; iph->tot_len = htons(tot_len); iph->id = 0; iph->ttl = 100; iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb_set_transport_header(skb, skb->len); udph = skb_put_zero(skb, sizeof(struct udphdr) + data_len); get_random_bytes(&udph->source, sizeof(u16)); get_random_bytes(&udph->dest, sizeof(u16)); udph->len = htons(sizeof(struct udphdr) + data_len); return skb; } static void nsim_dev_psample_md_prepare(const struct nsim_dev_psample *psample, struct psample_metadata *md, unsigned int len) { md->trunc_size = psample->trunc_size ? psample->trunc_size : len; md->in_ifindex = psample->in_ifindex; md->out_ifindex = psample->out_ifindex; if (psample->out_tc != NSIM_PSAMPLE_INVALID_TC) { md->out_tc = psample->out_tc; md->out_tc_valid = 1; } if (psample->out_tc_occ_max) { u64 out_tc_occ; get_random_bytes(&out_tc_occ, sizeof(u64)); md->out_tc_occ = out_tc_occ & (psample->out_tc_occ_max - 1); md->out_tc_occ_valid = 1; } if (psample->latency_max) { u64 latency; get_random_bytes(&latency, sizeof(u64)); md->latency = latency & (psample->latency_max - 1); md->latency_valid = 1; } } static void nsim_dev_psample_report_work(struct work_struct *work) { struct nsim_dev_psample *psample; struct psample_metadata md = {}; struct sk_buff *skb; unsigned long delay; psample = container_of(work, struct nsim_dev_psample, psample_dw.work); skb = nsim_dev_psample_skb_build(); if (!skb) goto out; nsim_dev_psample_md_prepare(psample, &md, skb->len); psample_sample_packet(psample->group, skb, psample->rate, &md); consume_skb(skb); out: delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS); schedule_delayed_work(&psample->psample_dw, delay); } static int nsim_dev_psample_enable(struct nsim_dev *nsim_dev) { struct nsim_dev_psample *psample = nsim_dev->psample; struct devlink *devlink; unsigned long delay; if (psample->is_active) return -EBUSY; devlink = priv_to_devlink(nsim_dev); psample->group = psample_group_get(devlink_net(devlink), psample->group_num); if (!psample->group) return -EINVAL; delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS); schedule_delayed_work(&psample->psample_dw, delay); psample->is_active = true; return 0; } static int nsim_dev_psample_disable(struct nsim_dev *nsim_dev) { struct nsim_dev_psample *psample = nsim_dev->psample; if (!psample->is_active) return -EINVAL; psample->is_active = false; cancel_delayed_work_sync(&psample->psample_dw); psample_group_put(psample->group); return 0; } static ssize_t nsim_dev_psample_enable_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; bool enable; int err; err = kstrtobool_from_user(data, count, &enable); if (err) return err; if (enable) err = nsim_dev_psample_enable(nsim_dev); else err = nsim_dev_psample_disable(nsim_dev); return err ? err : count; } static const struct file_operations nsim_psample_enable_fops = { .open = simple_open, .write = nsim_dev_psample_enable_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; int nsim_dev_psample_init(struct nsim_dev *nsim_dev) { struct nsim_dev_psample *psample; int err; psample = kzalloc(sizeof(*psample), GFP_KERNEL); if (!psample) return -ENOMEM; nsim_dev->psample = psample; INIT_DELAYED_WORK(&psample->psample_dw, nsim_dev_psample_report_work); psample->ddir = debugfs_create_dir("psample", nsim_dev->ddir); if (IS_ERR(psample->ddir)) { err = PTR_ERR(psample->ddir); goto err_psample_free; } /* Populate sampling parameters with sane defaults. */ psample->rate = 100; debugfs_create_u32("rate", 0600, psample->ddir, &psample->rate); psample->group_num = 10; debugfs_create_u32("group_num", 0600, psample->ddir, &psample->group_num); psample->trunc_size = 0; debugfs_create_u32("trunc_size", 0600, psample->ddir, &psample->trunc_size); psample->in_ifindex = 1; debugfs_create_u32("in_ifindex", 0600, psample->ddir, &psample->in_ifindex); psample->out_ifindex = 2; debugfs_create_u32("out_ifindex", 0600, psample->ddir, &psample->out_ifindex); psample->out_tc = 0; debugfs_create_u16("out_tc", 0600, psample->ddir, &psample->out_tc); psample->out_tc_occ_max = 10000; debugfs_create_u64("out_tc_occ_max", 0600, psample->ddir, &psample->out_tc_occ_max); psample->latency_max = 50; debugfs_create_u64("latency_max", 0600, psample->ddir, &psample->latency_max); debugfs_create_file("enable", 0200, psample->ddir, nsim_dev, &nsim_psample_enable_fops); return 0; err_psample_free: kfree(nsim_dev->psample); return err; } void nsim_dev_psample_exit(struct nsim_dev *nsim_dev) { debugfs_remove_recursive(nsim_dev->psample->ddir); if (nsim_dev->psample->is_active) { cancel_delayed_work_sync(&nsim_dev->psample->psample_dw); psample_group_put(nsim_dev->psample->group); } kfree(nsim_dev->psample); }
4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Public Key Signature Algorithm * * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/sig.h> #include <linux/cryptouser.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static void crypto_sig_exit_tfm(struct crypto_tfm *tfm) { struct crypto_sig *sig = __crypto_sig_tfm(tfm); struct sig_alg *alg = crypto_sig_alg(sig); alg->exit(sig); } static int crypto_sig_init_tfm(struct crypto_tfm *tfm) { struct crypto_sig *sig = __crypto_sig_tfm(tfm); struct sig_alg *alg = crypto_sig_alg(sig); if (alg->exit) sig->base.exit = crypto_sig_exit_tfm; if (alg->init) return alg->init(sig); return 0; } static void crypto_sig_free_instance(struct crypto_instance *inst) { struct sig_instance *sig = sig_instance(inst); sig->free(sig); } static void __maybe_unused crypto_sig_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : sig\n"); } static int __maybe_unused crypto_sig_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_sig rsig = {}; strscpy(rsig.type, "sig", sizeof(rsig.type)); return nla_put(skb, CRYPTOCFGA_REPORT_SIG, sizeof(rsig), &rsig); } static const struct crypto_type crypto_sig_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_sig_init_tfm, .free = crypto_sig_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_sig_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_sig_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SIG, .tfmsize = offsetof(struct crypto_sig, base), .algsize = offsetof(struct sig_alg, base), }; struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_sig_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_sig); static int sig_default_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { return -ENOSYS; } static int sig_default_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *dst, unsigned int dlen) { return -ENOSYS; } static int sig_default_set_key(struct crypto_sig *tfm, const void *key, unsigned int keylen) { return -ENOSYS; } static unsigned int sig_default_size(struct crypto_sig *tfm) { return DIV_ROUND_UP_POW2(crypto_sig_keysize(tfm), BITS_PER_BYTE); } static int sig_prepare_alg(struct sig_alg *alg) { struct crypto_alg *base = &alg->base; if (!alg->sign) alg->sign = sig_default_sign; if (!alg->verify) alg->verify = sig_default_verify; if (!alg->set_priv_key) alg->set_priv_key = sig_default_set_key; if (!alg->set_pub_key) return -EINVAL; if (!alg->key_size) return -EINVAL; if (!alg->max_size) alg->max_size = sig_default_size; if (!alg->digest_size) alg->digest_size = sig_default_size; base->cra_type = &crypto_sig_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_SIG; return 0; } int crypto_register_sig(struct sig_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = sig_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_sig); void crypto_unregister_sig(struct sig_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_sig); int sig_register_instance(struct crypto_template *tmpl, struct sig_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = sig_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, sig_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(sig_register_instance); int crypto_grab_sig(struct crypto_sig_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_sig_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_sig); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Public Key Signature Algorithms");
30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. */ #ifndef __DIR_DOT_H__ #define __DIR_DOT_H__ #include <linux/dcache.h> #include <linux/crc32.h> struct inode; struct gfs2_inode; struct gfs2_inum; struct buffer_head; struct gfs2_dirent; struct gfs2_diradd { unsigned nr_blocks; struct gfs2_dirent *dent; struct buffer_head *bh; int save_loc; }; struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, bool fail_on_exist); int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); int gfs2_dir_add(struct inode *inode, const struct qstr *filename, const struct gfs2_inode *ip, struct gfs2_diradd *da); static inline void gfs2_dir_no_add(struct gfs2_diradd *da) { brelse(da->bh); da->bh = NULL; } int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct file_ra_state *f_ra); int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, const struct gfs2_inode *nip, unsigned int new_type); int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); int gfs2_diradd_alloc_required(struct inode *dir, const struct qstr *filename, struct gfs2_diradd *da); int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); void gfs2_dir_hash_inval(struct gfs2_inode *ip); static inline u32 gfs2_disk_hash(const char *data, int len) { return crc32_le((u32)~0, data, len) ^ (u32)~0; } static inline void gfs2_str2qstr(struct qstr *name, const char *fname) { name->name = fname; name->len = strlen(fname); name->hash = gfs2_disk_hash(name->name, name->len); } /* N.B. This probably ought to take inum & type as args as well */ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) { dent->de_inum.no_addr = cpu_to_be64(0); dent->de_inum.no_formal_ino = cpu_to_be64(0); dent->de_hash = cpu_to_be32(name->hash); dent->de_rec_len = cpu_to_be16(reclen); dent->de_name_len = cpu_to_be16(name->len); dent->de_type = cpu_to_be16(0); memset(dent->__pad, 0, sizeof(dent->__pad)); memcpy(dent + 1, name->name, name->len); } extern struct qstr gfs2_qdot; extern struct qstr gfs2_qdotdot; #endif /* __DIR_DOT_H__ */
4 3 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved */ #include "autofs_i.h" static const char *autofs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct autofs_sb_info *sbi; struct autofs_info *ino; if (!dentry) return ERR_PTR(-ECHILD); sbi = autofs_sbi(dentry->d_sb); ino = autofs_dentry_ino(dentry); if (ino && !autofs_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; } const struct inode_operations autofs_symlink_inode_operations = { .get_link = autofs_get_link };
64 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0 #include <net/genetlink.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static const struct genl_ops ila_nl_ops[] = { { .cmd = ILA_CMD_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_add_mapping, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_del_mapping, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_flush, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_get_mapping, .start = ila_xlat_nl_dump_start, .dumpit = ila_xlat_nl_dump, .done = ila_xlat_nl_dump_done, }, }; unsigned int ila_net_id; struct genl_family ila_nl_family __ro_after_init = { .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, .maxattr = ILA_ATTR_MAX, .policy = ila_nl_policy, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, .ops = ila_nl_ops, .n_ops = ARRAY_SIZE(ila_nl_ops), .resv_start_op = ILA_CMD_FLUSH + 1, }; static __net_init int ila_init_net(struct net *net) { int err; err = ila_xlat_init_net(net); if (err) goto ila_xlat_init_fail; return 0; ila_xlat_init_fail: return err; } static __net_exit void ila_pre_exit_net(struct net *net) { ila_xlat_pre_exit_net(net); } static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); } static struct pernet_operations ila_net_ops = { .init = ila_init_net, .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), }; static int __init ila_init(void) { int ret; ret = register_pernet_device(&ila_net_ops); if (ret) goto register_device_fail; ret = genl_register_family(&ila_nl_family); if (ret) goto register_family_fail; ret = ila_lwt_init(); if (ret) goto fail_lwt; return 0; fail_lwt: genl_unregister_family(&ila_nl_family); register_family_fail: unregister_pernet_device(&ila_net_ops); register_device_fail: return ret; } static void __exit ila_fini(void) { ila_lwt_fini(); genl_unregister_family(&ila_nl_family); unregister_pernet_device(&ila_net_ops); } module_init(ila_init); module_exit(ila_fini); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)");
2128 7668 473 222 45 3 35 52 464 48 342 160 161 2 3880 196 2093 35 7638 7647 2869 2630 462 6 814 1056 12 5 1 6021 921 13051 1426 4 30 8 32 2029 2044 6 2350 272 26 2466 814 44 1082 15 2 2 200 29 6 67 54 4 10 518 726 26 93 3 29 5 4 3 17 21 40 19 2 2 5 1 1 1 167 25 10 55 272 1 2 3436 83 713 2275 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NETLINK_H #define __NET_NETLINK_H #include <linux/types.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/in6.h> /* ======================================================================== * Netlink Messages and Attributes Interface (As Seen On TV) * ------------------------------------------------------------------------ * Messages Interface * ------------------------------------------------------------------------ * * Message Format: * <--- nlmsg_total_size(payload) ---> * <-- nlmsg_msg_size(payload) -> * +----------+- - -+-------------+- - -+-------- - - * | nlmsghdr | Pad | Payload | Pad | nlmsghdr * +----------+- - -+-------------+- - -+-------- - - * nlmsg_data(nlh)---^ ^ * nlmsg_next(nlh)-----------------------+ * * Payload Format: * <---------------------- nlmsg_len(nlh) ---------------------> * <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) -> * +----------------------+- - -+--------------------------------+ * | Family Header | Pad | Attributes | * +----------------------+- - -+--------------------------------+ * nlmsg_attrdata(nlh, hdrlen)---^ * * Data Structures: * struct nlmsghdr netlink message header * * Message Construction: * nlmsg_new() create a new netlink message * nlmsg_put() add a netlink message to an skb * nlmsg_put_answer() callback based nlmsg_put() * nlmsg_end() finalize netlink message * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction * nlmsg_consume() free a netlink message (expected) * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups * nlmsg_unicast() unicast a message to a single socket * nlmsg_notify() send notification message * * Message Length Calculations: * nlmsg_msg_size(payload) length of message w/o padding * nlmsg_total_size(payload) length of message w/ padding * nlmsg_padlen(payload) length of padding at tail * * Message Payload Access: * nlmsg_data(nlh) head of message payload * nlmsg_len(nlh) length of message payload * nlmsg_attrdata(nlh, hdrlen) head of attributes data * nlmsg_attrlen(nlh, hdrlen) length of attributes data * * Message Parsing: * nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes? * nlmsg_next(nlh, remaining) get next netlink message * nlmsg_parse() parse attributes of a message * nlmsg_find_attr() find an attribute in a message * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes * nlmsg_for_each_attr_type() loop over all attributes with the * given type * * Misc: * nlmsg_report() report back to application? * * ------------------------------------------------------------------------ * Attributes Interface * ------------------------------------------------------------------------ * * Attribute Format: * <------- nla_total_size(payload) -------> * <---- nla_attr_size(payload) -----> * +----------+- - -+- - - - - - - - - +- - -+-------- - - * | Header | Pad | Payload | Pad | Header * +----------+- - -+- - - - - - - - - +- - -+-------- - - * <- nla_len(nla) -> ^ * nla_data(nla)----^ | * nla_next(nla)-----------------------------' * * Data Structures: * struct nlattr netlink attribute header * * Attribute Construction: * nla_reserve(skb, type, len) reserve room for an attribute * nla_reserve_nohdr(skb, len) reserve room for an attribute w/o hdr * nla_put(skb, type, len, data) add attribute to skb * nla_put_nohdr(skb, len, data) add attribute w/o hdr * nla_append(skb, len, data) append data to skb * * Attribute Construction for Basic Types: * nla_put_u8(skb, type, value) add u8 attribute to skb * nla_put_u16(skb, type, value) add u16 attribute to skb * nla_put_u32(skb, type, value) add u32 attribute to skb * nla_put_u64_64bit(skb, type, * value, padattr) add u64 attribute to skb * nla_put_s8(skb, type, value) add s8 attribute to skb * nla_put_s16(skb, type, value) add s16 attribute to skb * nla_put_s32(skb, type, value) add s32 attribute to skb * nla_put_s64(skb, type, value, * padattr) add s64 attribute to skb * nla_put_string(skb, type, str) add string attribute to skb * nla_put_flag(skb, type) add flag attribute to skb * nla_put_msecs(skb, type, jiffies, * padattr) add msecs attribute to skb * nla_put_in_addr(skb, type, addr) add IPv4 address attribute to skb * nla_put_in6_addr(skb, type, addr) add IPv6 address attribute to skb * * Nested Attributes Construction: * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction * nla_put_empty_nest(skb, type) create an empty nest * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding * nla_total_size(payload) length of attribute w/ padding * nla_padlen(payload) length of padding * * Attribute Payload Access: * nla_data(nla) head of attribute payload * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: * nla_get_uint(nla) get payload for a uint attribute * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute * nla_get_u64(nla) get payload for a u64 attribute * nla_get_s8(nla) get payload for a s8 attribute * nla_get_s16(nla) get payload for a s16 attribute * nla_get_s32(nla) get payload for a s32 attribute * nla_get_s64(nla) get payload for a s64 attribute * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * * The same functions also exist with _default(). * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: * nla_ok(nla, remaining) does nla fit into remaining bytes? * nla_next(nla, remaining) get next netlink attribute * nla_validate() validate a stream of attributes * nla_validate_nested() validate a stream of nested attributes * nla_find() find attribute in stream of attributes * nla_find_nested() find attribute in nested attributes * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes * nla_for_each_attr_type() loop over all attributes with the * given type * nla_for_each_nested() loop over the nested attributes * nla_for_each_nested_type() loop over the nested attributes with * the given type *========================================================================= */ /** * Standard attribute types to specify validation policy */ enum { NLA_UNSPEC, NLA_U8, NLA_U16, NLA_U32, NLA_U64, NLA_STRING, NLA_FLAG, NLA_MSECS, NLA_NESTED, NLA_NESTED_ARRAY, NLA_NUL_STRING, NLA_BINARY, NLA_S8, NLA_S16, NLA_S32, NLA_S64, NLA_BITFIELD32, NLA_REJECT, NLA_BE16, NLA_BE32, NLA_SINT, NLA_UINT, __NLA_TYPE_MAX, }; #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) struct netlink_range_validation { u64 min, max; }; struct netlink_range_validation_signed { s64 min, max; }; enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; /** * struct nla_policy - attribute validation policy * @type: Type of attribute or NLA_UNSPEC * @validation_type: type of attribute validation done in addition to * type-specific validation (e.g. range, function call), see * &enum nla_policy_validation * @len: Type specific length of payload * * Policies are defined as arrays of this struct, the array must be * accessible by attribute type up to the highest identifier to be expected. * * Meaning of `len' field: * NLA_STRING Maximum length of string * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused * All other Minimum length of attribute payload * * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and * `bitfield32_valid' is the u32 value of valid flags * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. * NLA_NESTED `nested_policy' to a nested policy to validate, must * also set `len' to the max attribute number. Use the * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, * must also set `len' to the max attribute number. Use * the provided NLA_POLICY_NESTED_ARRAY() macro. * The difference to NLA_NESTED is the structure: * NLA_NESTED has the nested attributes directly inside * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_BE16, * NLA_BE32, * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 The `min' and `max' fields are used depending on the * validation_type field, if that is min/max/range then * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that using the NLA_POLICY_FULL_RANGE() * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range' must be a pointer * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range_signed' must be a * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). * * NLA_BINARY If the validation type is like the ones for integers * above, then the min/max length (not value like for * integers) of the attribute is enforced. * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. * * All other Unused - but note that it's a union * * Example: * * static const u32 myvalidflags = 0xff231023; * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { u8 type; u8 validation_type; u16 len; union { /** * @strict_start_type: first attribute to validate strictly * * This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to * get the previous pure { .len = xyz } behaviour. The advantage * of this is that types not specified in the policy will be * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones * it should be set at least when new attributes are added to * the enum used by the policy, and be set to the new value that * was added to enforce strict validation from thereon. */ u16 strict_start_type; /* private: use NLA_POLICY_*() to set */ const u32 bitfield32_valid; const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; const struct netlink_range_validation *range; const struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); }; }; #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ tp == NLA_U64 || tp == NLA_UINT || \ tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp) #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } #define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \ .type = NLA_ENSURE_SINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range_signed = _range, \ } #define NLA_POLICY_MIN(tp, _min) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } #define NLA_POLICY_MASK(tp, _mask) { \ .type = NLA_ENSURE_UINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_MASK, \ .mask = _mask, \ } #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ .validate = fn, \ .len = __VA_ARGS__ + 0, \ } #define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) #define NLA_POLICY_EXACT_LEN_WARN(_len) { \ .type = NLA_BINARY, \ .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ .min = _len, \ .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) #define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request * @nl_net: Network namespace * @portid: Netlink PORTID of requesting application * @skip_notify: Skip netlink notifications to user space * @skip_notify_kernel: Skip selected in-kernel notifications */ struct nl_info { struct nlmsghdr *nlh; struct net *nl_net; u32 portid; u8 skip_notify:1, skip_notify_kernel:1; }; /** * enum netlink_validation - netlink message/attribute validation levels * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about * extra data at the end of the message, attributes being longer than * they should be, or unknown attributes being present. * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. * This can safely be set by the kernel when the given policy has no * NLA_UNSPEC anymore, and can thus be used to ensure policy entries * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, NL_VALIDATE_TRAILING = BIT(0), NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE) #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ NL_VALIDATE_STRICT_ATTRS |\ NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen); struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen); void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_append(struct sk_buff *skb, int attrlen, const void *data); /************************************************************************** * Netlink Messages **************************************************************************/ /** * nlmsg_msg_size - length of netlink message not including padding * @payload: length of message payload */ static inline int nlmsg_msg_size(int payload) { return NLMSG_HDRLEN + payload; } /** * nlmsg_total_size - length of netlink message including padding * @payload: length of message payload */ static inline int nlmsg_total_size(int payload) { return NLMSG_ALIGN(nlmsg_msg_size(payload)); } /** * nlmsg_padlen - length of padding at the message's tail * @payload: length of message payload */ static inline int nlmsg_padlen(int payload) { return nlmsg_total_size(payload) - nlmsg_msg_size(payload); } /** * nlmsg_data - head of message payload * @nlh: netlink message header */ static inline void *nlmsg_data(const struct nlmsghdr *nlh) { return (unsigned char *) nlh + NLMSG_HDRLEN; } /** * nlmsg_len - length of message payload * @nlh: netlink message header */ static inline int nlmsg_len(const struct nlmsghdr *nlh) { return nlh->nlmsg_len - NLMSG_HDRLEN; } /** * nlmsg_payload - message payload if the data fits in the len * @nlh: netlink message header * @len: struct length * * Returns: The netlink message payload/data if the length is sufficient, * otherwise NULL. */ static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) { if (nlh->nlmsg_len < nlmsg_msg_size(len)) return NULL; return nlmsg_data(nlh); } /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, int hdrlen) { unsigned char *data = nlmsg_data(nlh); return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen)); } /** * nlmsg_attrlen - length of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen) { return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen); } /** * nlmsg_ok - check if the netlink message fits into the remaining bytes * @nlh: netlink message header * @remaining: number of bytes remaining in message stream */ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) { return (remaining >= (int) sizeof(struct nlmsghdr) && nlh->nlmsg_len >= sizeof(struct nlmsghdr) && nlh->nlmsg_len <= remaining); } /** * nlmsg_next - next netlink message in message stream * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * nlmsg_next(const struct nlmsghdr *nlh, int *remaining) { int totlen = NLMSG_ALIGN(nlh->nlmsg_len); *remaining -= totlen; return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } /** * nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy, validate, extack); } /** * nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_parse_deprecated_strict - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() */ static inline int nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) { return nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), attrtype); } /** * nla_validate_deprecated - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. * See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of family specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, int hdrlen, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return __nla_validate(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_report - need to report back to application? * @nlh: netlink message header * * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } /** * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { return nlh ? nlh->nlmsg_seq : 0; } /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header * @hdrlen: length of family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ nlmsg_attrlen(nlh, hdrlen), rem) /** * nlmsg_for_each_attr_type - iterate over a stream of attributes * @pos: loop counter, set to the current attribute * @type: required attribute type for @pos * @nlh: netlink message header * @hdrlen: length of the family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \ nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ if (nla_type(pos) == type) /** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application * @seq: sequence number of message * @type: message type * @payload: length of message payload * @flags: message flags * * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int payload, int flags) { if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload))) return NULL; return __nlmsg_put(skb, portid, seq, type, payload, flags); } /** * nlmsg_append - Add more data to a nlmsg in a skb * @skb: socket buffer to store message in * @size: length of message payload * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) { if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) return NULL; if (NLMSG_ALIGN(size) - size) memset(skb_tail_pointer(skb) + size, 0, NLMSG_ALIGN(size) - size); return __skb_put(skb, NLMSG_ALIGN(size)); } /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in * @cb: netlink callback * @type: message type * @payload: length of message payload * @flags: message flags * * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, struct netlink_callback *cb, int type, int payload, int flags) { return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, type, payload, flags); } /** * nlmsg_new - Allocate a new netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. * * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known * and a good default is needed. */ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) { return alloc_skb(nlmsg_total_size(payload), flags); } /** * nlmsg_new_large - Allocate a new netlink message with non-contiguous * physical memory * @payload: size of the message payload * * The allocated skb is unable to have frag page for shinfo->frags*, * as the NULL setting for skb->head in netlink_skb_destructor() will * bypass most of the handling in skb_release_data() */ static inline struct sk_buff *nlmsg_new_large(size_t payload) { return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); } /** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Corrects the netlink message header to include the appended * attributes. Only necessary if attributes have been added to * the message. */ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) { nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; } /** * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { return skb_tail_pointer(skb); } /** * nlmsg_trim - Trim message to a mark * @skb: socket buffer the message is stored in * @mark: mark to trim to * * Trims the message to the provided mark. */ static inline void nlmsg_trim(struct sk_buff *skb, const void *mark) { if (mark) { WARN_ON((unsigned char *) mark < skb->data); skb_trim(skb, (unsigned char *) mark - skb->data); } } /** * nlmsg_cancel - Cancel construction of a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Removes the complete netlink message including all * attributes from the socket buffer again. */ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) { nlmsg_trim(skb, nlh); } /** * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) { kfree_skb(skb); } /** * nlmsg_consume - free a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_consume(struct sk_buff *skb) { consume_skb(skb); } /** * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; err = netlink_broadcast_filtered(sk, skb, portid, group, flags, filter, filter_data); if (err > 0) err = 0; return err; } /** * nlmsg_multicast - multicast a netlink message * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags */ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return nlmsg_multicast_filtered(sk, skb, portid, group, flags, NULL, NULL); } /** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid) { int err; err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT); if (err > 0) err = 0; return err; } /** * nlmsg_for_each_msg - iterate over a stream of messages * @pos: loop counter, set to current message * @head: head of message stream * @len: length of message stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_msg(pos, head, len, rem) \ for (pos = head, rem = len; \ nlmsg_ok(pos, rem); \ pos = nlmsg_next(pos, &(rem))) /** * nl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @nlh: netlink message header to write the flag to * * This function checks if the sequence (generation) number changed during dump * and if it did, advertises it in the netlink message header. * * The correct way to use it is to set cb->seq to the generation counter when * all locks for dumping have been acquired, and then call this function for * each message that is generated. * * Note that due to initialisation concerns, 0 is an invalid sequence number * and must not be used by code that uses this functionality. */ static inline void nl_dump_check_consistent(struct netlink_callback *cb, struct nlmsghdr *nlh) { if (cb->prev_seq && cb->seq != cb->prev_seq) nlh->nlmsg_flags |= NLM_F_DUMP_INTR; cb->prev_seq = cb->seq; } /************************************************************************** * Netlink Attributes **************************************************************************/ /** * nla_attr_size - length of attribute not including padding * @payload: length of payload */ static inline int nla_attr_size(int payload) { return NLA_HDRLEN + payload; } /** * nla_total_size - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size(int payload) { return NLA_ALIGN(nla_attr_size(payload)); } /** * nla_padlen - length of padding at the tail of attribute * @payload: length of payload */ static inline int nla_padlen(int payload) { return nla_total_size(payload) - nla_attr_size(payload); } /** * nla_type - attribute type * @nla: netlink attribute */ static inline int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; } /** * nla_data - head of payload * @nla: netlink attribute */ static inline void *nla_data(const struct nlattr *nla) { return (char *) nla + NLA_HDRLEN; } /** * nla_len - length of payload * @nla: netlink attribute */ static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } /** * nla_ok - check if the netlink attribute fits into the remaining bytes * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream */ static inline int nla_ok(const struct nlattr *nla, int remaining) { return remaining >= (int) sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } /** * nla_next - next netlink attribute in attribute stream * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { unsigned int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; return (struct nlattr *) ((char *) nla + totlen); } /** * nla_find_nested - find attribute in a set of nested attributes * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) { return nla_find(nla_data(nla), nla_len(nla), attrtype); } /** * nla_parse_nested - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (!(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); return -EINVAL; } return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_put_u8 - Add a u8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { /* temporary variables to work around GCC PR81715 with asan-stack=1 */ u8 tmp = value; return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** * nla_put_u16 - Add a u16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { u16 tmp = value; return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { __le16 tmp = value; return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** * nla_put_u32 - Add a u32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { u32 tmp = value; return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** * nla_put_uint - Add a variable-size unsigned int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) { u64 tmp64 = value; u32 tmp32 = value; if (tmp64 == tmp32) return nla_put_u32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(u64), &tmp64); } /** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { __le32 tmp = value; return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { u64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } /** * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { __le64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** * nla_put_s8 - Add a s8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { s8 tmp = value; return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** * nla_put_s16 - Add a s16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { s16 tmp = value; return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** * nla_put_s32 - Add a s32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { s32 tmp = value; return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { s64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** * nla_put_sint - Add a variable-size signed int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) { s64 tmp64 = value; s32 tmp32 = value; if (tmp64 == tmp32) return nla_put_s32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(s64), &tmp64); } /** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @str: NUL terminated string */ static inline int nla_put_string(struct sk_buff *skb, int attrtype, const char *str) { return nla_put(skb, attrtype, strlen(str) + 1, str); } /** * nla_put_flag - Add a flag netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type */ static inline int nla_put_flag(struct sk_buff *skb, int attrtype) { return nla_put(skb, attrtype, 0, NULL); } /** * nla_put_msecs - Add a msecs netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @njiffies: number of jiffies to convert to msecs * @padattr: attribute type for the padding */ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, unsigned long njiffies, int padattr) { u64 tmp = jiffies_to_msecs(njiffies); return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv4 address */ static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { __be32 tmp = addr; return nla_put_be32(skb, attrtype, tmp); } /** * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv6 address */ static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype, const struct in6_addr *addr) { return nla_put(skb, attrtype, sizeof(*addr), addr); } /** * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: value carrying bits * @selector: selector of valid bits */ static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype, __u32 value, __u32 selector) { struct nla_bitfield32 tmp = { value, selector, }; return nla_put(skb, attrtype, sizeof(tmp), &tmp); } /** * nla_get_u32 - return payload of u32 attribute * @nla: u32 netlink attribute */ static inline u32 nla_get_u32(const struct nlattr *nla) { return *(u32 *) nla_data(nla); } /** * nla_get_u32_default - return payload of u32 attribute or default * @nla: u32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) { if (!nla) return defvalue; return nla_get_u32(nla); } /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ static inline __be32 nla_get_be32(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_be32_default - return payload of be32 attribute or default * @nla: __be32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_be32_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_be32(nla); } /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ static inline __le32 nla_get_le32(const struct nlattr *nla) { return *(__le32 *) nla_data(nla); } /** * nla_get_le32_default - return payload of le32 attribute or default * @nla: __le32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le32 nla_get_le32_default(const struct nlattr *nla, __le32 defvalue) { if (!nla) return defvalue; return nla_get_le32(nla); } /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ static inline u16 nla_get_u16(const struct nlattr *nla) { return *(u16 *) nla_data(nla); } /** * nla_get_u16_default - return payload of u16 attribute or default * @nla: u16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) { if (!nla) return defvalue; return nla_get_u16(nla); } /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ static inline __be16 nla_get_be16(const struct nlattr *nla) { return *(__be16 *) nla_data(nla); } /** * nla_get_be16_default - return payload of be16 attribute or default * @nla: __be16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be16 nla_get_be16_default(const struct nlattr *nla, __be16 defvalue) { if (!nla) return defvalue; return nla_get_be16(nla); } /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ static inline __le16 nla_get_le16(const struct nlattr *nla) { return *(__le16 *) nla_data(nla); } /** * nla_get_le16_default - return payload of le16 attribute or default * @nla: __le16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le16 nla_get_le16_default(const struct nlattr *nla, __le16 defvalue) { if (!nla) return defvalue; return nla_get_le16(nla); } /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ static inline u8 nla_get_u8(const struct nlattr *nla) { return *(u8 *) nla_data(nla); } /** * nla_get_u8_default - return payload of u8 attribute or default * @nla: u8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) { if (!nla) return defvalue; return nla_get_u8(nla); } /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ static inline u64 nla_get_u64(const struct nlattr *nla) { u64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_u64_default - return payload of u64 attribute or default * @nla: u64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_u64(nla); } /** * nla_get_uint - return payload of uint attribute * @nla: uint netlink attribute */ static inline u64 nla_get_uint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(u32)) return nla_get_u32(nla); return nla_get_u64(nla); } /** * nla_get_uint_default - return payload of uint attribute or default * @nla: uint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_uint(nla); } /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ static inline __be64 nla_get_be64(const struct nlattr *nla) { __be64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_be64_default - return payload of be64 attribute or default * @nla: __be64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be64 nla_get_be64_default(const struct nlattr *nla, __be64 defvalue) { if (!nla) return defvalue; return nla_get_be64(nla); } /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ static inline __le64 nla_get_le64(const struct nlattr *nla) { return *(__le64 *) nla_data(nla); } /** * nla_get_le64_default - return payload of le64 attribute or default * @nla: __le64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le64 nla_get_le64_default(const struct nlattr *nla, __le64 defvalue) { if (!nla) return defvalue; return nla_get_le64(nla); } /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ static inline s32 nla_get_s32(const struct nlattr *nla) { return *(s32 *) nla_data(nla); } /** * nla_get_s32_default - return payload of s32 attribute or default * @nla: s32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) { if (!nla) return defvalue; return nla_get_s32(nla); } /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ static inline s16 nla_get_s16(const struct nlattr *nla) { return *(s16 *) nla_data(nla); } /** * nla_get_s16_default - return payload of s16 attribute or default * @nla: s16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) { if (!nla) return defvalue; return nla_get_s16(nla); } /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ static inline s8 nla_get_s8(const struct nlattr *nla) { return *(s8 *) nla_data(nla); } /** * nla_get_s8_default - return payload of s8 attribute or default * @nla: s8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) { if (!nla) return defvalue; return nla_get_s8(nla); } /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ static inline s64 nla_get_s64(const struct nlattr *nla) { s64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_s64_default - return payload of s64 attribute or default * @nla: s64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_s64(nla); } /** * nla_get_sint - return payload of uint attribute * @nla: uint netlink attribute */ static inline s64 nla_get_sint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(s32)) return nla_get_s32(nla); return nla_get_s64(nla); } /** * nla_get_sint_default - return payload of sint attribute or default * @nla: sint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_sint(nla); } /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ static inline int nla_get_flag(const struct nlattr *nla) { return !!nla; } /** * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { u64 msecs = nla_get_u64(nla); return msecs_to_jiffies((unsigned long) msecs); } /** * nla_get_msecs_default - return payload of msecs attribute or default * @nla: msecs netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, unsigned long defvalue) { if (!nla) return defvalue; return nla_get_msecs(nla); } /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ static inline __be32 nla_get_in_addr(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_in_addr_default - return payload of be32 attribute or default * @nla: IPv4 address netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_in_addr(nla); } /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) { struct in6_addr tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_bitfield32 - return payload of 32 bitfield attribute * @nla: nla_bitfield32 attribute */ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) { struct nla_bitfield32 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } #define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * This function exists for backward compatibility to use in APIs which never * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); if (nla_put(skb, attrtype, 0, NULL) < 0) return NULL; return start; } /** * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); } /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in * @start: container attribute * * Corrects the container attribute header to include the all * appended attributes. * * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { start->nla_len = skb_tail_pointer(skb) - (unsigned char *)start; return skb->len; } /** * nla_nest_cancel - Cancel nesting of attributes * @skb: socket buffer the message is stored in * @start: container attribute * * Removes the container attribute and including all nested * attributes. Returns -EMSGSIZE */ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) { nlmsg_trim(skb, start); } /** * nla_put_empty_nest - Create an empty nest * @skb: socket buffer the message is stored in * @attrtype: attribute type of the container * * This function is a helper for creating empty nests. * * Returns: 0 when successful or -EMSGSIZE on failure. */ static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype) { return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE; } /** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, validate, extack); } static inline int nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); } static inline int nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) { #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* The nlattr header is 4 bytes in size, that's why we test * if the skb->data _is_ aligned. A NOP attribute, plus * nlattr header for next attribute, will make nla_data() * 8-byte aligned. */ if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) return true; #endif return false; } /** * nla_align_64bit - 64-bit align the nla_data() of next attribute * @skb: socket buffer the message is stored in * @padattr: attribute type for the padding * * Conditionally emit a padding netlink attribute in order to make * the next attribute we emit have a 64-bit aligned nla_data() area. * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * * Returns: zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { if (nla_need_padding_for_64bit(skb) && !nla_reserve(skb, padattr, 0)) return -EMSGSIZE; return 0; } /** * nla_total_size_64bit - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size_64bit(int payload) { return NLA_ALIGN(nla_attr_size(payload)) #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + NLA_ALIGN(nla_attr_size(0)) #endif ; } /** * nla_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr(pos, head, len, rem) \ for (pos = head, rem = len; \ nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) /** * nla_for_each_attr_type - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr_type(pos, type, head, len, rem) \ nla_for_each_attr(pos, head, len, rem) \ if (nla_type(pos) == type) /** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested(pos, nla, rem) \ nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** * nla_for_each_nested_type - iterate over nested attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested_type(pos, type, nla, rem) \ nla_for_each_nested(pos, nla, rem) \ if (nla_type(pos) == type) /** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream */ static inline bool nla_is_last(const struct nlattr *nla, int rem) { return nla->nla_len == rem; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range); void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); struct netlink_policy_dump_state; int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, const struct nla_policy *policy, unsigned int maxtype); int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, const struct nla_policy *policy, unsigned int maxtype); bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); int netlink_policy_dump_write_attr(struct sk_buff *skb, const struct nla_policy *pt, int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif
5 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "truncate.h" struct io_ftrunc { struct file *file; loff_t len; }; int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index || sqe->splice_fd_in || sqe->addr3) return -EINVAL; ft->len = READ_ONCE(sqe->off); req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) { struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_ftruncate(req->file, ft->len, 1); io_req_set_res(req, ret, 0); return IOU_COMPLETE; }
4 2 1 1 22 2 16 7 11 14 5 5 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/io_uring.h> #include <uapi/linux/fadvise.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "advise.h" struct io_fadvise { struct file *file; u64 offset; u64 len; u32 advice; }; struct io_madvise { struct file *file; u64 addr; u64 len; u32 advice; }; int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->off); if (!ma->len) ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; #endif } int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif } static bool io_fadvise_force_async(struct io_fadvise *fa) { switch (fa->advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_SEQUENTIAL: return false; default: return true; } } int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; fa->offset = READ_ONCE(sqe->off); fa->len = READ_ONCE(sqe->addr); if (!fa->len) fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); if (io_fadvise_force_async(fa)) req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa)); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_COMPLETE; }
19 1 9 8 1 3 7 1 1 5 1 3 2 14 7 1 4 1 1 5 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match L2TP header parameters. */ /* (C) 2013 James Chapman <jchapman@katalix.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/udp.h> #include <linux/l2tp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_l2tp.h> /* L2TP header masks */ #define L2TP_HDR_T_BIT 0x8000 #define L2TP_HDR_L_BIT 0x4000 #define L2TP_HDR_VER 0x000f MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("Xtables: L2TP header match"); MODULE_ALIAS("ipt_l2tp"); MODULE_ALIAS("ip6t_l2tp"); /* The L2TP fields that can be matched */ struct l2tp_data { u32 tid; u32 sid; u8 type; u8 version; }; union l2tp_val { __be16 val16[2]; __be32 val32; }; static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) { if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) return false; if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) return false; /* Check tid only for L2TPv3 control or any L2TPv2 packets */ if ((info->flags & XT_L2TP_TID) && ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && (info->tid != data->tid)) return false; /* Check sid only for L2TP data packets */ if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && (info->sid != data->sid)) return false; return true; } /* Parse L2TP header fields when UDP encapsulation is used. Handles * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a * different format. See * RFC2661, Section 3.1, L2TPv2 Header Format * RFC3931, Section 3.2.1, L2TPv3 Control Message Header * RFC3931, Section 3.2.2, L2TPv3 Data Message Header * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP */ static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) { const struct xt_l2tp_info *info = par->matchinfo; int uhlen = sizeof(struct udphdr); int offs = thoff + uhlen; union l2tp_val *lh; union l2tp_val lhbuf; u16 flags; struct l2tp_data data = { 0, }; if (par->fragoff != 0) return false; /* Extract L2TP header fields. The flags in the first 16 bits * tell us where the other fields are. */ lh = skb_header_pointer(skb, offs, 2, &lhbuf); if (lh == NULL) return false; flags = ntohs(lh->val16[0]); if (flags & L2TP_HDR_T_BIT) data.type = XT_L2TP_TYPE_CONTROL; else data.type = XT_L2TP_TYPE_DATA; data.version = (u8) flags & L2TP_HDR_VER; /* Now extract the L2TP tid/sid. These are in different places * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we * must also check to see if the length field is present, * since this affects the offsets into the packet of the * tid/sid fields. */ if (data.version == 3) { lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); if (lh == NULL) return false; if (data.type == XT_L2TP_TYPE_CONTROL) data.tid = ntohl(lh->val32); else data.sid = ntohl(lh->val32); } else if (data.version == 2) { if (flags & L2TP_HDR_L_BIT) offs += 2; lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); if (lh == NULL) return false; data.tid = (u32) ntohs(lh->val16[0]); data.sid = (u32) ntohs(lh->val16[1]); } else return false; return l2tp_match(info, &data); } /* Parse L2TP header fields for IP encapsulation (no UDP header). * L2TPv3 data packets have a different form with IP encap. See * RC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. * RC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. */ static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) { const struct xt_l2tp_info *info = par->matchinfo; union l2tp_val *lh; union l2tp_val lhbuf; struct l2tp_data data = { 0, }; /* For IP encap, the L2TP sid is the first 32-bits. */ lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); if (lh == NULL) return false; if (lh->val32 == 0) { /* Must be a control packet. The L2TP tid is further * into the packet. */ data.type = XT_L2TP_TYPE_CONTROL; lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), &lhbuf); if (lh == NULL) return false; data.tid = ntohl(lh->val32); } else { data.sid = ntohl(lh->val32); data.type = XT_L2TP_TYPE_DATA; } data.version = 3; return l2tp_match(info, &data); } static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); u8 ipproto = iph->protocol; /* l2tp_mt_check4 already restricts the transport protocol */ switch (ipproto) { case IPPROTO_UDP: return l2tp_udp_mt(skb, par, par->thoff); case IPPROTO_L2TP: return l2tp_ip_mt(skb, par, par->thoff); } return false; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) { unsigned int thoff = 0; unsigned short fragoff = 0; int ipproto; ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); if (fragoff != 0) return false; /* l2tp_mt_check6 already restricts the transport protocol */ switch (ipproto) { case IPPROTO_UDP: return l2tp_udp_mt(skb, par, thoff); case IPPROTO_L2TP: return l2tp_ip_mt(skb, par, thoff); } return false; } #endif static int l2tp_mt_check(const struct xt_mtchk_param *par) { const struct xt_l2tp_info *info = par->matchinfo; /* Check for invalid flags */ if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | XT_L2TP_TYPE)) { pr_info_ratelimited("unknown flags: %x\n", info->flags); return -EINVAL; } /* At least one of tid, sid or type=control must be specified */ if ((!(info->flags & XT_L2TP_TID)) && (!(info->flags & XT_L2TP_SID)) && ((!(info->flags & XT_L2TP_TYPE)) || (info->type != XT_L2TP_TYPE_CONTROL))) { pr_info_ratelimited("invalid flags combination: %x\n", info->flags); return -EINVAL; } /* If version 2 is specified, check that incompatible params * are not supplied */ if (info->flags & XT_L2TP_VERSION) { if ((info->version < 2) || (info->version > 3)) { pr_info_ratelimited("wrong L2TP version: %u\n", info->version); return -EINVAL; } if (info->version == 2) { if ((info->flags & XT_L2TP_TID) && (info->tid > 0xffff)) { pr_info_ratelimited("v2 tid > 0xffff: %u\n", info->tid); return -EINVAL; } if ((info->flags & XT_L2TP_SID) && (info->sid > 0xffff)) { pr_info_ratelimited("v2 sid > 0xffff: %u\n", info->sid); return -EINVAL; } } } return 0; } static int l2tp_mt_check4(const struct xt_mtchk_param *par) { const struct xt_l2tp_info *info = par->matchinfo; const struct ipt_entry *e = par->entryinfo; const struct ipt_ip *ip = &e->ip; int ret; ret = l2tp_mt_check(par); if (ret != 0) return ret; if ((ip->proto != IPPROTO_UDP) && (ip->proto != IPPROTO_L2TP)) { pr_info_ratelimited("missing protocol rule (udp|l2tpip)\n"); return -EINVAL; } if ((ip->proto == IPPROTO_L2TP) && (info->version == 2)) { pr_info_ratelimited("v2 doesn't support IP mode\n"); return -EINVAL; } return 0; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int l2tp_mt_check6(const struct xt_mtchk_param *par) { const struct xt_l2tp_info *info = par->matchinfo; const struct ip6t_entry *e = par->entryinfo; const struct ip6t_ip6 *ip = &e->ipv6; int ret; ret = l2tp_mt_check(par); if (ret != 0) return ret; if ((ip->proto != IPPROTO_UDP) && (ip->proto != IPPROTO_L2TP)) { pr_info_ratelimited("missing protocol rule (udp|l2tpip)\n"); return -EINVAL; } if ((ip->proto == IPPROTO_L2TP) && (info->version == 2)) { pr_info_ratelimited("v2 doesn't support IP mode\n"); return -EINVAL; } return 0; } #endif static struct xt_match l2tp_mt_reg[] __read_mostly = { { .name = "l2tp", .revision = 0, .family = NFPROTO_IPV4, .match = l2tp_mt4, .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), .checkentry = l2tp_mt_check4, .hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD)), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "l2tp", .revision = 0, .family = NFPROTO_IPV6, .match = l2tp_mt6, .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), .checkentry = l2tp_mt_check6, .hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD)), .me = THIS_MODULE, }, #endif }; static int __init l2tp_mt_init(void) { return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); } static void __exit l2tp_mt_exit(void) { xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); } module_init(l2tp_mt_init); module_exit(l2tp_mt_exit);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ /* * Management Component Transport Protocol (MCTP) * * Copyright (c) 2021 Code Construct * Copyright (c) 2021 Google */ #ifndef __NET_MCTP_H #define __NET_MCTP_H #include <linux/bits.h> #include <linux/mctp.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/sock.h> /* MCTP packet definitions */ struct mctp_hdr { u8 ver; u8 dest; u8 src; u8 flags_seq_tag; }; #define MCTP_VER_MIN 1 #define MCTP_VER_MAX 1 /* Definitions for flags_seq_tag field */ #define MCTP_HDR_FLAG_SOM BIT(7) #define MCTP_HDR_FLAG_EOM BIT(6) #define MCTP_HDR_FLAG_TO BIT(3) #define MCTP_HDR_FLAGS GENMASK(5, 3) #define MCTP_HDR_SEQ_SHIFT 4 #define MCTP_HDR_SEQ_MASK GENMASK(1, 0) #define MCTP_HDR_TAG_SHIFT 0 #define MCTP_HDR_TAG_MASK GENMASK(2, 0) #define MCTP_INITIAL_DEFAULT_NET 1 static inline bool mctp_address_unicast(mctp_eid_t eid) { return eid >= 8 && eid < 255; } static inline bool mctp_address_broadcast(mctp_eid_t eid) { return eid == 255; } static inline bool mctp_address_null(mctp_eid_t eid) { return eid == 0; } static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) { return match == eid || match == MCTP_ADDR_ANY; } static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) { return (struct mctp_hdr *)skb_network_header(skb); } /* socket implementation */ struct mctp_sock { struct sock sk; /* bind() params */ unsigned int bind_net; mctp_eid_t bind_local_addr; mctp_eid_t bind_peer_addr; unsigned int bind_peer_net; bool bind_peer_set; __u8 bind_type; /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ bool addr_ext; /* list of mctp_sk_key, for incoming tag lookup. updates protected * by sk->net->keys_lock */ struct hlist_head keys; /* mechanism for expiring allocated keys; will release an allocated * tag, and any netdev state for a request/response pairing */ struct timer_list key_expiry; }; /* Key for matching incoming packets to sockets or reassembly contexts. * Packets are matched on (peer EID, local EID, tag). * * Lifetime / locking requirements: * * - individual key data (ie, the struct itself) is protected by key->lock; * changes must be made with that lock held. * * - the lookup fields: peer_addr, local_addr and tag are set before the * key is added to lookup lists, and never updated. * * - A ref to the key must be held (throuh key->refs) if a pointer to the * key is to be accessed after key->lock is released. * * - a mctp_sk_key contains a reference to a struct sock; this is valid * for the life of the key. On sock destruction (through unhash), the key is * removed from lists (see below), and marked invalid. * * - these mctp_sk_keys appear on two lists: * 1) the struct mctp_sock->keys list * 2) the struct netns_mctp->keys list * * presences on these lists requires a (single) refcount to be held; both * lists are updated as a single operation. * * Updates and lookups in either list are performed under the * netns_mctp->keys lock. Lookup functions will need to lock the key and * take a reference before unlocking the keys_lock. Consequently, the list's * keys_lock *cannot* be acquired with the individual key->lock held. * * - a key may have a sk_buff attached as part of an in-progress message * reassembly (->reasm_head). The reasm data is protected by the individual * key->lock. * * - there are two destruction paths for a mctp_sk_key: * * - through socket unhash (see mctp_sk_unhash). This performs the list * removal under keys_lock. * * - where a key is established to receive a reply message: after receiving * the (complete) reply, or during reassembly errors. Here, we clean up * the reassembly context (marking reasm_dead, to prevent another from * starting), and remove the socket from the netns & socket lists. * * - through an expiry timeout, on a per-socket timer */ struct mctp_sk_key { unsigned int net; mctp_eid_t peer_addr; mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ /* we hold a ref to sk when set */ struct sock *sk; /* routing lookup list */ struct hlist_node hlist; /* per-socket list */ struct hlist_node sklist; /* lock protects against concurrent updates to the reassembly and * expiry data below. */ spinlock_t lock; /* Keys are referenced during the output path, which may sleep */ refcount_t refs; /* incoming fragment reassembly context */ struct sk_buff *reasm_head; struct sk_buff **reasm_tailp; bool reasm_dead; u8 last_seq; /* key validity */ bool valid; /* expiry timeout; valid (above) cleared on expiry */ unsigned long expiry; /* free to use for device flow state tracking. Initialised to * zero on initial key creation */ unsigned long dev_flow_state; struct mctp_dev *dev; /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire * automatically on timeout or response, instead SIOCMCTPDROPTAG * is used. */ bool manual_alloc; }; struct mctp_skb_cb { unsigned int magic; unsigned int net; /* fields below provide extended addressing for ingress to recvmsg() */ int ifindex; unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; }; /* skb control-block accessors with a little extra debugging for initial * development. * * TODO: remove checks & mctp_skb_cb->magic; replace callers of __mctp_cb * with mctp_cb(). * * __mctp_cb() is only for the initial ingress code; we should see ->magic set * at all times after this. */ static inline struct mctp_skb_cb *__mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; cb->magic = 0x4d435450; return cb; } static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); return cb; } /* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, * indicating the flow to the device driver. */ struct mctp_flow { struct mctp_sk_key *key; }; struct mctp_dst; /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for * removed routes. We hold a reference to the netdev; routes need to be * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; unsigned char type; unsigned int mtu; enum { MCTP_ROUTE_DIRECT, MCTP_ROUTE_GATEWAY, } dst_type; union { struct mctp_dev *dev; struct mctp_fq_addr gateway; }; int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; refcount_t refs; struct rcu_head rcu; }; /* Route lookup result: dst. Represents the results of a routing decision, * but is only held over the individual routing operation. * * Will typically be stored on the caller stack, and must be released after * usage. */ struct mctp_dst { struct mctp_dev *dev; unsigned int mtu; mctp_eid_t nexthop; /* set for direct addressing */ unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; int (*output)(struct mctp_dst *dst, struct sk_buff *skb); }; int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, unsigned char halen, const unsigned char *haddr); /* route interfaces */ int mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr, struct mctp_dst *dst); void mctp_dst_release(struct mctp_dst *dst); /* always takes ownership of skb */ int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, unsigned int netid, mctp_eid_t local, mctp_eid_t peer, bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); int mctp_default_net_set(struct net *net, unsigned int index); int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr); int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr); void mctp_route_remove_dev(struct mctp_dev *mdev); /* neighbour definitions */ enum mctp_neigh_source { MCTP_NEIGH_STATIC, MCTP_NEIGH_DISCOVER, }; struct mctp_neigh { struct mctp_dev *dev; mctp_eid_t eid; enum mctp_neigh_source source; unsigned char ha[MAX_ADDR_LEN]; struct list_head list; struct rcu_head rcu; }; int mctp_neigh_init(void); void mctp_neigh_exit(void); // ret_hwaddr may be NULL, otherwise must have space for MAX_ADDR_LEN int mctp_neigh_lookup(struct mctp_dev *dev, mctp_eid_t eid, void *ret_hwaddr); void mctp_neigh_remove_dev(struct mctp_dev *mdev); int mctp_routes_init(void); void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); /* MCTP IDs and Codes from DMTF specification * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf */ enum mctp_phys_binding { MCTP_PHYS_BINDING_UNSPEC = 0x00, MCTP_PHYS_BINDING_SMBUS = 0x01, MCTP_PHYS_BINDING_PCIE_VDM = 0x02, MCTP_PHYS_BINDING_USB = 0x03, MCTP_PHYS_BINDING_KCS = 0x04, MCTP_PHYS_BINDING_SERIAL = 0x05, MCTP_PHYS_BINDING_I3C = 0x06, MCTP_PHYS_BINDING_MMBI = 0x07, MCTP_PHYS_BINDING_PCC = 0x08, MCTP_PHYS_BINDING_UCIE = 0x09, MCTP_PHYS_BINDING_VENDOR = 0xFF, }; #endif /* __NET_MCTP_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM thermal #if !defined(_TRACE_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_THERMAL_H #include <linux/devfreq.h> #include <linux/thermal.h> #include <linux/tracepoint.h> #include "thermal_core.h" TRACE_DEFINE_ENUM(THERMAL_TRIP_CRITICAL); TRACE_DEFINE_ENUM(THERMAL_TRIP_HOT); TRACE_DEFINE_ENUM(THERMAL_TRIP_PASSIVE); TRACE_DEFINE_ENUM(THERMAL_TRIP_ACTIVE); #define show_tzt_type(type) \ __print_symbolic(type, \ { THERMAL_TRIP_CRITICAL, "CRITICAL"}, \ { THERMAL_TRIP_HOT, "HOT"}, \ { THERMAL_TRIP_PASSIVE, "PASSIVE"}, \ { THERMAL_TRIP_ACTIVE, "ACTIVE"}) TRACE_EVENT(thermal_temperature, TP_PROTO(struct thermal_zone_device *tz), TP_ARGS(tz), TP_STRUCT__entry( __string(thermal_zone, tz->type) __field(int, id) __field(int, temp_prev) __field(int, temp) ), TP_fast_assign( __assign_str(thermal_zone); __entry->id = tz->id; __entry->temp_prev = tz->last_temperature; __entry->temp = tz->temperature; ), TP_printk("thermal_zone=%s id=%d temp_prev=%d temp=%d", __get_str(thermal_zone), __entry->id, __entry->temp_prev, __entry->temp) ); TRACE_EVENT(cdev_update, TP_PROTO(struct thermal_cooling_device *cdev, unsigned long target), TP_ARGS(cdev, target), TP_STRUCT__entry( __string(type, cdev->type) __field(unsigned long, target) ), TP_fast_assign( __assign_str(type); __entry->target = target; ), TP_printk("type=%s target=%lu", __get_str(type), __entry->target) ); TRACE_EVENT(thermal_zone_trip, TP_PROTO(struct thermal_zone_device *tz, int trip, enum thermal_trip_type trip_type), TP_ARGS(tz, trip, trip_type), TP_STRUCT__entry( __string(thermal_zone, tz->type) __field(int, id) __field(int, trip) __field(enum thermal_trip_type, trip_type) ), TP_fast_assign( __assign_str(thermal_zone); __entry->id = tz->id; __entry->trip = trip; __entry->trip_type = trip_type; ), TP_printk("thermal_zone=%s id=%d trip=%d trip_type=%s", __get_str(thermal_zone), __entry->id, __entry->trip, show_tzt_type(__entry->trip_type)) ); #ifdef CONFIG_CPU_THERMAL TRACE_EVENT(thermal_power_cpu_get_power_simple, TP_PROTO(int cpu, u32 power), TP_ARGS(cpu, power), TP_STRUCT__entry( __field(int, cpu) __field(u32, power) ), TP_fast_assign( __entry->cpu = cpu; __entry->power = power; ), TP_printk("cpu=%d power=%u", __entry->cpu, __entry->power) ); TRACE_EVENT(thermal_power_cpu_limit, TP_PROTO(const struct cpumask *cpus, unsigned int freq, unsigned long cdev_state, u32 power), TP_ARGS(cpus, freq, cdev_state, power), TP_STRUCT__entry( __bitmask(cpumask, num_possible_cpus()) __field(unsigned int, freq ) __field(unsigned long, cdev_state) __field(u32, power ) ), TP_fast_assign( __assign_bitmask(cpumask, cpumask_bits(cpus), num_possible_cpus()); __entry->freq = freq; __entry->cdev_state = cdev_state; __entry->power = power; ), TP_printk("cpus=%s freq=%u cdev_state=%lu power=%u", __get_bitmask(cpumask), __entry->freq, __entry->cdev_state, __entry->power) ); #endif /* CONFIG_CPU_THERMAL */ #ifdef CONFIG_DEVFREQ_THERMAL TRACE_EVENT(thermal_power_devfreq_get_power, TP_PROTO(struct thermal_cooling_device *cdev, struct devfreq_dev_status *status, unsigned long freq, u32 power), TP_ARGS(cdev, status, freq, power), TP_STRUCT__entry( __string(type, cdev->type ) __field(unsigned long, freq ) __field(u32, busy_time) __field(u32, total_time) __field(u32, power) ), TP_fast_assign( __assign_str(type); __entry->freq = freq; __entry->busy_time = status->busy_time; __entry->total_time = status->total_time; __entry->power = power; ), TP_printk("type=%s freq=%lu load=%u power=%u", __get_str(type), __entry->freq, __entry->total_time == 0 ? 0 : (100 * __entry->busy_time) / __entry->total_time, __entry->power) ); TRACE_EVENT(thermal_power_devfreq_limit, TP_PROTO(struct thermal_cooling_device *cdev, unsigned long freq, unsigned long cdev_state, u32 power), TP_ARGS(cdev, freq, cdev_state, power), TP_STRUCT__entry( __string(type, cdev->type) __field(unsigned int, freq ) __field(unsigned long, cdev_state) __field(u32, power ) ), TP_fast_assign( __assign_str(type); __entry->freq = freq; __entry->cdev_state = cdev_state; __entry->power = power; ), TP_printk("type=%s freq=%u cdev_state=%lu power=%u", __get_str(type), __entry->freq, __entry->cdev_state, __entry->power) ); #endif /* CONFIG_DEVFREQ_THERMAL */ #endif /* _TRACE_THERMAL_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE thermal_trace /* This part must be outside protection */ #include <trace/define_trace.h>
4 1 3 5 1 4 2 1 1 2 1 1 5 2 1 5 6 1 1 1 2 9 6 3 9 3 4 5 5 5 1 3 6 3 2 1 1 3 3 1 2 8 7 7 7 5 5 5 4 19 1 1 1 1 8 5 3 1 1 1 10 1 6 3 1 1 3 1 3 5 5 9 1 8 1 10 8 1 7 7 7 6 1 6 1 7 65 65 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB * * Refer to: * draft-ietf-forces-interfelfb-03 * and * netdev01 paper: * "Distributing Linux Traffic Control Classifier-Action * Subsystem" * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai * * copyright Jamal Hadi Salim (2015) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_ife.h> #include <net/tc_act/tc_ife.h> #include <linux/etherdevice.h> #include <net/ife.h> #include <net/tc_wrapper.h> static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = { [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)}, [TCA_IFE_DMAC] = { .len = ETH_ALEN}, [TCA_IFE_SMAC] = { .len = ETH_ALEN}, [TCA_IFE_TYPE] = { .type = NLA_U16}, }; int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) { u16 edata = 0; if (mi->metaval) edata = *(u16 *)mi->metaval; else if (metaval) edata = metaval; if (!edata) /* will not encode */ return 0; edata = htons(edata); return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); } EXPORT_SYMBOL_GPL(ife_encode_meta_u16); int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval); else return nla_put(skb, mi->metaid, 0, NULL); } EXPORT_SYMBOL_GPL(ife_get_meta_u32); int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) { if (metaval || mi->metaval) return 8; /* T+L+V == 2+2+4 */ return 0; } EXPORT_SYMBOL_GPL(ife_check_meta_u32); int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) { if (metaval || mi->metaval) return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ return 0; } EXPORT_SYMBOL_GPL(ife_check_meta_u16); int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; if (mi->metaval) edata = *(u32 *)mi->metaval; else if (metaval) edata = metaval; if (!edata) /* will not encode */ return 0; edata = htonl(edata); return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata); } EXPORT_SYMBOL_GPL(ife_encode_meta_u32); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval); else return nla_put(skb, mi->metaid, 0, NULL); } EXPORT_SYMBOL_GPL(ife_get_meta_u16); int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { mi->metaval = kmemdup(metaval, sizeof(u32), gfp); if (!mi->metaval) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { mi->metaval = kmemdup(metaval, sizeof(u16), gfp); if (!mi->metaval) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(ife_alloc_meta_u16); void ife_release_meta_gen(struct tcf_meta_info *mi) { kfree(mi->metaval); } EXPORT_SYMBOL_GPL(ife_release_meta_gen); int ife_validate_meta_u32(void *val, int len) { if (len == sizeof(u32)) return 0; return -EINVAL; } EXPORT_SYMBOL_GPL(ife_validate_meta_u32); int ife_validate_meta_u16(void *val, int len) { /* length will not include padding */ if (len == sizeof(u16)) return 0; return -EINVAL; } EXPORT_SYMBOL_GPL(ife_validate_meta_u16); static LIST_HEAD(ifeoplist); static DEFINE_RWLOCK(ife_mod_lock); static struct tcf_meta_ops *find_ife_oplist(u16 metaid) { struct tcf_meta_ops *o; read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { if (o->metaid == metaid) { if (!try_module_get(o->owner)) o = NULL; read_unlock(&ife_mod_lock); return o; } } read_unlock(&ife_mod_lock); return NULL; } int register_ife_op(struct tcf_meta_ops *mops) { struct tcf_meta_ops *m; if (!mops->metaid || !mops->metatype || !mops->name || !mops->check_presence || !mops->encode || !mops->decode || !mops->get || !mops->alloc) return -EINVAL; write_lock(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid || (strcmp(mops->name, m->name) == 0)) { write_unlock(&ife_mod_lock); return -EEXIST; } } if (!mops->release) mops->release = ife_release_meta_gen; list_add_tail(&mops->list, &ifeoplist); write_unlock(&ife_mod_lock); return 0; } EXPORT_SYMBOL_GPL(unregister_ife_op); int unregister_ife_op(struct tcf_meta_ops *mops) { struct tcf_meta_ops *m; int err = -ENOENT; write_lock(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid) { list_del(&mops->list); err = 0; break; } } write_unlock(&ife_mod_lock); return err; } EXPORT_SYMBOL_GPL(register_ife_op); static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) { int ret = 0; /* XXX: unfortunately cant use nla_policy at this point * because a length of 0 is valid in the case of * "allow". "use" semantics do enforce for proper * length and i couldve use nla_policy but it makes it hard * to use it just for that.. */ if (ops->validate) return ops->validate(val, len); if (ops->metatype == NLA_U32) ret = ife_validate_meta_u32(val, len); else if (ops->metatype == NLA_U16) ret = ife_validate_meta_u16(val, len); return ret; } #ifdef CONFIG_MODULES static const char *ife_meta_id2name(u32 metaid) { switch (metaid) { case IFE_META_SKBMARK: return "skbmark"; case IFE_META_PRIO: return "skbprio"; case IFE_META_TCINDEX: return "tcindex"; default: return "unknown"; } } #endif /* called when adding new meta information */ static int load_metaops_and_vet(u32 metaid, void *val, int len, bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; if (!ops) { ret = -ENOENT; #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); if (rtnl_held) rtnl_lock(); ops = find_ife_oplist(metaid); #endif } if (ops) { ret = 0; if (len) ret = ife_validate_metatype(ops, val, len); module_put(ops->owner); } return ret; } /* called when adding new meta information */ static int __add_metainfo(const struct tcf_meta_ops *ops, struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool atomic, bool exists) { struct tcf_meta_info *mi = NULL; int ret = 0; mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL); if (!mi) return -ENOMEM; mi->metaid = metaid; mi->ops = ops; if (len > 0) { ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL); if (ret != 0) { kfree(mi); return ret; } } if (exists) spin_lock_bh(&ife->tcf_lock); list_add_tail(&mi->metalist, &ife->metalist); if (exists) spin_unlock_bh(&ife->tcf_lock); return ret; } static int add_metainfo_and_get_ops(const struct tcf_meta_ops *ops, struct tcf_ife_info *ife, u32 metaid, bool exists) { int ret; if (!try_module_get(ops->owner)) return -ENOENT; ret = __add_metainfo(ops, ife, metaid, NULL, 0, true, exists); if (ret) module_put(ops->owner); return ret; } static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, int len, bool exists) { const struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret; if (!ops) return -ENOENT; ret = __add_metainfo(ops, ife, metaid, metaval, len, false, exists); if (ret) /*put back what find_ife_oplist took */ module_put(ops->owner); return ret; } static int use_all_metadata(struct tcf_ife_info *ife, bool exists) { struct tcf_meta_ops *o; int rc = 0; int installed = 0; read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { rc = add_metainfo_and_get_ops(o, ife, o->metaid, exists); if (rc == 0) installed += 1; } read_unlock(&ife_mod_lock); if (installed) return 0; else return -EINVAL; } static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife) { struct tcf_meta_info *e; struct nlattr *nest; unsigned char *b = skb_tail_pointer(skb); int total_encoded = 0; /*can only happen on decode */ if (list_empty(&ife->metalist)) return 0; nest = nla_nest_start_noflag(skb, TCA_IFE_METALST); if (!nest) goto out_nlmsg_trim; list_for_each_entry(e, &ife->metalist, metalist) { if (!e->ops->get(skb, e)) total_encoded += 1; } if (!total_encoded) goto out_nlmsg_trim; nla_nest_end(skb, nest); return 0; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } /* under ife->tcf_lock */ static void _tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_meta_info *e, *n; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { list_del(&e->metalist); if (e->metaval) { if (e->ops->release) e->ops->release(e); else kfree(e->metaval); } module_put(e->ops->owner); kfree(e); } } static void tcf_ife_cleanup(struct tc_action *a) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a); spin_unlock_bh(&ife->tcf_lock); p = rcu_dereference_protected(ife->params, 1); if (p) kfree_rcu(p, rcu); } static int load_metalist(struct nlattr **tb, bool rtnl_held) { int i; for (i = 1; i < max_metacnt; i++) { if (tb[i]) { void *val = nla_data(tb[i]); int len = nla_len(tb[i]); int rc; rc = load_metaops_and_vet(i, val, len, rtnl_held); if (rc != 0) return rc; } } return 0; } static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, bool exists, bool rtnl_held) { int len = 0; int rc = 0; int i = 0; void *val; for (i = 1; i < max_metacnt; i++) { if (tb[i]) { val = nla_data(tb[i]); len = nla_len(tb[i]); rc = add_metainfo(ife, i, val, len, exists); if (rc) return rc; } } return rc; } static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; u8 *daddr = NULL; u8 *saddr = NULL; bool exists = false; int ret = 0; u32 index; int err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy, NULL); if (err < 0) return err; if (!tb[TCA_IFE_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_IFE_PARMS]); /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because * they cannot run as the same time. Check on all other values which * are not supported right now. */ if (parm->flags & ~IFE_ENCODE) return -EINVAL; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; if (tb[TCA_IFE_METALST]) { err = nla_parse_nested_deprecated(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], NULL, NULL); if (err) { kfree(p); return err; } err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) { kfree(p); return err; } } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) { kfree(p); return err; } exists = err; if (exists && bind) { kfree(p); return ACT_P_BOUND; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_ife_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); kfree(p); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); kfree(p); return -EEXIST; } ife = to_ife(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) ife_type = nla_get_u16(tb[TCA_IFE_TYPE]); if (tb[TCA_IFE_DMAC]) daddr = nla_data(tb[TCA_IFE_DMAC]); if (tb[TCA_IFE_SMAC]) saddr = nla_data(tb[TCA_IFE_SMAC]); } if (parm->flags & IFE_ENCODE) { if (daddr) ether_addr_copy(p->eth_dst, daddr); else eth_zero_addr(p->eth_dst); if (saddr) ether_addr_copy(p->eth_src, saddr); else eth_zero_addr(p->eth_src); p->eth_type = ife_type; } if (tb[TCA_IFE_METALST]) { err = populate_metalist(ife, tb2, exists, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) goto metadata_parse_err; } else { /* if no passed metadata allow list or passed allow-all * then here we process by adding as many supported metadatum * as we can. You better have at least one else we are * going to bail out */ err = use_all_metadata(ife, exists); if (err) goto metadata_parse_err; } if (exists) spin_lock_bh(&ife->tcf_lock); /* protected by tcf_lock when modifying existing action */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(ife->params, p, 1); if (exists) spin_unlock_bh(&ife->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; metadata_parse_err: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: kfree(p); tcf_idr_release(*a, bind); return err; } static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; p = rcu_dereference_protected(ife->params, lockdep_is_held(&ife->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &ife->tcf_tm); if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; if (!is_zero_ether_addr(p->eth_dst)) { if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } if (!is_zero_ether_addr(p->eth_src)) { if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { /*ignore failure to dump metalist */ pr_info("Failed to dump metalist\n"); } spin_unlock_bh(&ife->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&ife->tcf_lock); nlmsg_trim(skb, b); return -1; } static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; /* XXX: use hash to speed up */ list_for_each_entry(e, &ife->metalist, metalist) { if (metaid == e->metaid) { if (e->ops) { /* We check for decode presence already */ return e->ops->decode(skb, mdata, mlen); } } } return -ENOENT; } static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; u8 *ifehdr_end; u8 *tlv_data; u16 metalen; bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } ifehdr_end = tlv_data + metalen; for (; tlv_data < ifehdr_end; tlv_data = ife_tlv_meta_next(tlv_data)) { u8 *curr_data; u16 mtype; u16 dlen; curr_data = ife_tlv_meta_decode(tlv_data, ifehdr_end, &mtype, &dlen, NULL); if (!curr_data) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } if (find_decode_metaid(skb, ife, mtype, dlen, curr_data)) { /* abuse overlimits to count when we receive metadata * but dont have an ops for it */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skb->protocol = eth_type_trans(skb, skb->dev); skb_reset_network_header(skb); return action; } /*XXX: check if we can do this at install time instead of current * send data path **/ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) { struct tcf_meta_info *e, *n; int tot_run_sz = 0, run_sz = 0; list_for_each_entry_safe(e, n, &ife->metalist, metalist) { if (e->ops->check_presence) { run_sz = e->ops->check_presence(skb, e); tot_run_sz += run_sz; } } return tot_run_sz; } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; struct ethhdr *oethh; /* outer ether header */ struct tcf_meta_info *e; /* OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA where ORIGDATA = original ethernet header ... */ u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = 0; int new_len = skb->len + hdrm; bool exceed_mtu = false; void *ife_meta; int err = 0; if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) exceed_mtu = true; } bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); ife_meta = ife_encode(skb, metalen); spin_lock(&ife->tcf_lock); /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... */ list_for_each_entry(e, &ife->metalist, metalist) { if (e->ops->encode) { err = e->ops->encode(skb, (void *)(ife_meta + skboff), e); } if (err < 0) { /* too corrupt to keep around if overwritten */ spin_unlock(&ife->tcf_lock); qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; } spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; if (!is_zero_ether_addr(p->eth_src)) ether_addr_copy(oethh->h_source, p->eth_src); if (!is_zero_ether_addr(p->eth_dst)) ether_addr_copy(oethh->h_dest, p->eth_dst); oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); return action; } TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; int ret; p = rcu_dereference_bh(ife->params); if (p->flags & IFE_ENCODE) { ret = tcf_ife_encode(skb, a, res, p); return ret; } return tcf_ife_decode(skb, a, res); } static struct tc_action_ops act_ife_ops = { .kind = "ife", .id = TCA_ID_IFE, .owner = THIS_MODULE, .act = tcf_ife_act, .dump = tcf_ife_dump, .cleanup = tcf_ife_cleanup, .init = tcf_ife_init, .size = sizeof(struct tcf_ife_info), }; MODULE_ALIAS_NET_ACT("ife"); static __net_init int ife_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); return tc_action_net_init(net, tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_ife_ops.net_id); } static struct pernet_operations ife_net_ops = { .init = ife_init_net, .exit_batch = ife_exit_net, .id = &act_ife_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init ife_init_module(void) { return tcf_register_action(&act_ife_ops, &ife_net_ops); } static void __exit ife_cleanup_module(void) { tcf_unregister_action(&act_ife_ops, &ife_net_ops); } module_init(ife_init_module); module_exit(ife_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE LFB action"); MODULE_LICENSE("GPL");
10 10 2 1 1 1 1 2 2 2 1 6 1 1 9 2 7 7 4 3 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 // SPDX-License-Identifier: GPL-2.0-only /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. * * Authors: Artem Bityutskiy (Битюцкий Артём) * Adrian Hunter */ /* * This file implements UBIFS initialization and VFS superblock operations. Some * initialization stuff which is rather large and complex is placed at * corresponding subsystems, but most of it is here. */ #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/kthread.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/writeback.h> #include "ubifs.h" static int ubifs_default_version_set(const char *val, const struct kernel_param *kp) { int n = 0, ret; ret = kstrtoint(val, 10, &n); if (ret != 0 || n < 4 || n > UBIFS_FORMAT_VERSION) return -EINVAL; return param_set_int(val, kp); } static const struct kernel_param_ops ubifs_default_version_ops = { .set = ubifs_default_version_set, .get = param_get_int, }; int ubifs_default_version = UBIFS_FORMAT_VERSION; module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_version, 0600); /* * Maximum amount of memory we may 'kmalloc()' without worrying that we are * allocating too much. */ #define UBIFS_KMALLOC_OK (128*1024) /* Slab cache for UBIFS inodes */ static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. * @c: UBIFS file-system description object * @inode: the inode to validate * * This is a helper function for 'ubifs_iget()' which validates various fields * of a newly built inode to make sure they contain sane values and prevent * possible vulnerabilities. Returns zero if the inode is all right and * a non-zero error code if not. */ static int validate_inode(struct ubifs_info *c, const struct inode *inode) { int err; const struct ubifs_inode *ui = ubifs_inode(inode); if (inode->i_size > c->max_inode_sz) { ubifs_err(c, "inode is too large (%lld)", (long long)inode->i_size); return 1; } if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { ubifs_err(c, "unknown compression type %d", ui->compr_type); return 2; } if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) return 3; if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) return 4; if (ui->xattr && !S_ISREG(inode->i_mode)) return 5; if (!ubifs_compr_present(c, ui->compr_type)) { ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in", inode->i_ino, ubifs_compr_name(c, ui->compr_type)); } err = dbg_check_dir(c, inode); return err; } struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) { int err; union ubifs_key key; struct ubifs_ino_node *ino; struct ubifs_info *c = sb->s_fs_info; struct inode *inode; struct ubifs_inode *ui; dbg_gen("inode %lu", inum); inode = iget_locked(sb, inum); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; ui = ubifs_inode(inode); ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); if (!ino) { err = -ENOMEM; goto out; } ino_key_init(c, &key, inode->i_ino); err = ubifs_tnc_lookup(c, &key, ino); if (err) goto out_ino; inode->i_flags |= S_NOCMTIME; if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) inode->i_flags |= S_NOATIME; set_nlink(inode, le32_to_cpu(ino->nlink)); i_uid_write(inode, le32_to_cpu(ino->uid)); i_gid_write(inode, le32_to_cpu(ino->gid)); inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec), le32_to_cpu(ino->atime_nsec)); inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec), le32_to_cpu(ino->mtime_nsec)); inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); inode->i_size = le64_to_cpu(ino->size); ui->data_len = le32_to_cpu(ino->data_len); ui->flags = le32_to_cpu(ino->flags); ui->compr_type = le16_to_cpu(ino->compr_type); ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); ui->xattr_size = le32_to_cpu(ino->xattr_size); ui->xattr_names = le32_to_cpu(ino->xattr_names); ui->synced_i_size = ui->ui_size = inode->i_size; ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0; err = validate_inode(c, inode); if (err) goto out_invalid; switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; inode->i_op = &ubifs_file_inode_operations; inode->i_fop = &ubifs_file_operations; if (ui->xattr) { ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; } else if (ui->data_len != 0) { err = 10; goto out_invalid; } break; case S_IFDIR: inode->i_op = &ubifs_dir_inode_operations; inode->i_fop = &ubifs_dir_operations; if (ui->data_len != 0) { err = 11; goto out_invalid; } break; case S_IFLNK: inode->i_op = &ubifs_symlink_inode_operations; if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { err = 12; goto out_invalid; } ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; break; case S_IFBLK: case S_IFCHR: { dev_t rdev; union ubifs_dev_desc *dev; ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_ino; } dev = (union ubifs_dev_desc *)ino->data; if (ui->data_len == sizeof(dev->new)) rdev = new_decode_dev(le32_to_cpu(dev->new)); else if (ui->data_len == sizeof(dev->huge)) rdev = huge_decode_dev(le64_to_cpu(dev->huge)); else { err = 13; goto out_invalid; } memcpy(ui->data, ino->data, ui->data_len); inode->i_op = &ubifs_file_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } case S_IFSOCK: case S_IFIFO: inode->i_op = &ubifs_file_inode_operations; init_special_inode(inode, inode->i_mode, 0); if (ui->data_len != 0) { err = 14; goto out_invalid; } break; default: err = 15; goto out_invalid; } kfree(ino); ubifs_set_inode_flags(inode); unlock_new_inode(inode); return inode; out_invalid: ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); out: ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err); iget_failed(inode); return ERR_PTR(err); } static struct inode *ubifs_alloc_inode(struct super_block *sb) { struct ubifs_inode *ui; ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS); if (!ui) return NULL; memset((void *)ui + sizeof(struct inode), 0, sizeof(struct ubifs_inode) - sizeof(struct inode)); mutex_init(&ui->ui_mutex); init_rwsem(&ui->xattr_sem); spin_lock_init(&ui->ui_lock); return &ui->vfs_inode; }; static void ubifs_free_inode(struct inode *inode) { struct ubifs_inode *ui = ubifs_inode(inode); kfree(ui->data); fscrypt_free_inode(inode); kmem_cache_free(ubifs_inode_slab, ui); } /* * Note, Linux write-back code calls this without 'i_mutex'. */ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) { int err = 0; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); ubifs_assert(c, !ui->xattr); if (is_bad_inode(inode)) return 0; mutex_lock(&ui->ui_mutex); /* * Due to races between write-back forced by budgeting * (see 'sync_some_inodes()') and background write-back, the inode may * have already been synchronized, do not do this again. This might * also happen if it was synchronized in an VFS operation, e.g. * 'ubifs_link()'. */ if (!ui->dirty) { mutex_unlock(&ui->ui_mutex); return 0; } /* * As an optimization, do not write orphan inodes to the media just * because this is not needed. */ dbg_gen("inode %lu, mode %#x, nlink %u", inode->i_ino, (int)inode->i_mode, inode->i_nlink); if (inode->i_nlink) { err = ubifs_jnl_write_inode(c, inode); if (err) ubifs_err(c, "can't write inode %lu, error %d", inode->i_ino, err); else err = dbg_check_inode_size(c, inode, ui->ui_size); } ui->dirty = 0; mutex_unlock(&ui->ui_mutex); ubifs_release_dirty_inode_budget(c, ui); return err; } static int ubifs_drop_inode(struct inode *inode) { int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); return drop; } static void ubifs_evict_inode(struct inode *inode) { int err; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); if (ui->xattr) /* * Extended attribute inode deletions are fully handled in * 'ubifs_removexattr()'. These inodes are special and have * limited usage, so there is nothing to do here. */ goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); if (inode->i_nlink) goto done; if (is_bad_inode(inode)) goto out; ui->ui_size = inode->i_size = 0; err = ubifs_jnl_delete_inode(c, inode); if (err) /* * Worst case we have a lost orphan inode wasting space, so a * simple error message is OK here. */ ubifs_err(c, "can't delete inode %lu, error %d", inode->i_ino, err); out: if (ui->dirty) ubifs_release_dirty_inode_budget(c, ui); else { /* We've deleted something - clean the "no space" flags */ c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } done: clear_inode(inode); fscrypt_put_encryption_info(inode); } static void ubifs_dirty_inode(struct inode *inode, int flags) { struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); if (!ui->dirty) { ui->dirty = 1; dbg_gen("inode %lu", inode->i_ino); } } static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; __le32 *uuid = (__le32 *)c->uuid; free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); buf->f_type = UBIFS_SUPER_MAGIC; buf->f_bsize = UBIFS_BLOCK_SIZE; buf->f_blocks = c->block_cnt; buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; if (free > c->report_rp_size) buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; else buf->f_bavail = 0; buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); ubifs_assert(c, buf->f_bfree <= c->block_cnt); return 0; } static int ubifs_show_options(struct seq_file *s, struct dentry *root) { struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) seq_puts(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) seq_puts(s, ",norm_unmount"); if (c->mount_opts.bulk_read == 2) seq_puts(s, ",bulk_read"); else if (c->mount_opts.bulk_read == 1) seq_puts(s, ",no_bulk_read"); if (c->mount_opts.chk_data_crc == 2) seq_puts(s, ",chk_data_crc"); else if (c->mount_opts.chk_data_crc == 1) seq_puts(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { seq_printf(s, ",compr=%s", ubifs_compr_name(c, c->mount_opts.compr_type)); } seq_printf(s, ",assert=%s", ubifs_assert_action_name(c)); seq_printf(s, ",ubi=%d,vol=%d", c->vi.ubi_num, c->vi.vol_id); return 0; } static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; /* * Zero @wait is just an advisory thing to help the file system shove * lots of data into the queues, and there will be the second * '->sync_fs()' call, with non-zero @wait. */ if (!wait) return 0; /* * Synchronize write buffers, because 'ubifs_run_commit()' does not * do this if it waits for an already running commit. */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) return err; } /* * Strictly speaking, it is not necessary to commit the journal here, * synchronizing write-buffers would be enough. But committing makes * UBIFS free space predictions much more accurate, so we want to let * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ err = ubifs_run_commit(c); if (err) return err; return ubi_sync(c->vi.ubi_num); } /** * init_constants_early - initialize UBIFS constants. * @c: UBIFS file-system description object * * This function initialize UBIFS constants which do not need the superblock to * be read. It also checks that the UBI volume satisfies basic UBIFS * requirements. Returns zero in case of success and a negative error code in * case of failure. */ static int init_constants_early(struct ubifs_info *c) { if (c->vi.corrupted) { ubifs_warn(c, "UBI volume is corrupted - read-only mode"); c->ro_media = 1; } if (c->di.ro_mode) { ubifs_msg(c, "read-only UBI device"); c->ro_media = 1; } if (c->vi.vol_type == UBI_STATIC_VOLUME) { ubifs_msg(c, "static UBI volume - read-only mode"); c->ro_media = 1; } c->leb_cnt = c->vi.size; c->leb_size = c->vi.usable_leb_size; c->leb_start = c->di.leb_start; c->half_leb_size = c->leb_size / 2; c->min_io_size = c->di.min_io_size; c->min_io_shift = fls(c->min_io_size) - 1; c->max_write_size = c->di.max_write_size; c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes", c->leb_size, UBIFS_MIN_LEB_SZ); return -EINVAL; } if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { ubifs_errc(c, "too few LEBs (%d), min. is %d", c->leb_cnt, UBIFS_MIN_LEB_CNT); return -EINVAL; } if (!is_power_of_2(c->min_io_size)) { ubifs_errc(c, "bad min. I/O size %d", c->min_io_size); return -EINVAL; } /* * Maximum write size has to be greater or equivalent to min. I/O * size, and be multiple of min. I/O size. */ if (c->max_write_size < c->min_io_size || c->max_write_size % c->min_io_size || !is_power_of_2(c->max_write_size)) { ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit", c->max_write_size, c->min_io_size); return -EINVAL; } /* * UBIFS aligns all node to 8-byte boundary, so to make function in * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is * less than 8. */ if (c->min_io_size < 8) { c->min_io_size = 8; c->min_io_shift = 3; if (c->max_write_size < c->min_io_size) { c->max_write_size = c->min_io_size; c->max_write_shift = c->min_io_shift; } } c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); /* * Initialize node length ranges which are mostly needed for node * length validation. */ c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; c->ranges[UBIFS_AUTH_NODE].min_len = UBIFS_AUTH_NODE_SZ; c->ranges[UBIFS_AUTH_NODE].max_len = UBIFS_AUTH_NODE_SZ + UBIFS_MAX_HMAC_LEN; c->ranges[UBIFS_SIG_NODE].min_len = UBIFS_SIG_NODE_SZ; c->ranges[UBIFS_SIG_NODE].max_len = c->leb_size - UBIFS_SB_NODE_SZ; c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; c->ranges[UBIFS_ORPH_NODE].min_len = UBIFS_ORPH_NODE_SZ + sizeof(__le64); c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; /* * Minimum indexing node size is amended later when superblock is * read and the key length is known. */ c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; /* * Maximum indexing node size is amended later when superblock is * read and the fanout is known. */ c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; /* * Initialize dead and dark LEB space watermarks. See gc.c for comments * about these values. */ c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); /* * Calculate how many bytes would be wasted at the end of LEB if it was * fully filled with data nodes of maximum size. This is used in * calculations when reporting free space. */ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; /* Buffer size for bulk-reads */ c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; if (c->max_bu_buf_len > c->leb_size) c->max_bu_buf_len = c->leb_size; /* Log is ready, preserve one LEB for commits. */ c->min_log_bytes = c->leb_size; return 0; } /** * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. * @c: UBIFS file-system description object * @lnum: LEB the write-buffer was synchronized to * @free: how many free bytes left in this LEB * @pad: how many bytes were padded * * This is a callback function which is called by the I/O unit when the * write-buffer is synchronized. We need this to correctly maintain space * accounting in bud logical eraseblocks. This function returns zero in case of * success and a negative error code in case of failure. * * This function actually belongs to the journal, but we keep it here because * we want to keep it static. */ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) { return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); } /* * init_constants_sb - initialize UBIFS constants. * @c: UBIFS file-system description object * * This is a helper function which initializes various UBIFS constants after * the superblock has been read. It also checks various UBIFS parameters and * makes sure they are all right. Returns zero in case of success and a * negative error code in case of failure. */ static int init_constants_sb(struct ubifs_info *c) { int tmp, err; long long tmp64; c->main_bytes = (long long)c->main_lebs * c->leb_size; c->max_znode_sz = sizeof(struct ubifs_znode) + c->fanout * sizeof(struct ubifs_zbranch); tmp = ubifs_idx_node_sz(c, 1); c->ranges[UBIFS_IDX_NODE].min_len = tmp; c->min_idx_node_sz = ALIGN(tmp, 8); tmp = ubifs_idx_node_sz(c, c->fanout); c->ranges[UBIFS_IDX_NODE].max_len = tmp; c->max_idx_node_sz = ALIGN(tmp, 8); /* Make sure LEB size is large enough to fit full commit */ tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { ubifs_err(c, "too small LEB size %d, at least %d needed", c->leb_size, tmp); return -EINVAL; } /* * Make sure that the log is large enough to fit reference nodes for * all buds plus one reserved LEB. */ tmp64 = c->max_bud_bytes + c->leb_size - 1; c->max_bud_cnt = div_u64(tmp64, c->leb_size); tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { ubifs_err(c, "too small log %d LEBs, required min. %d LEBs", c->log_lebs, tmp); return -EINVAL; } /* * When budgeting we assume worst-case scenarios when the pages are not * be compressed and direntries are of the maximum size. * * Note, data, which may be stored in inodes is budgeted separately, so * it is not included into 'c->bi.inode_budget'. */ c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; c->bi.inode_budget = UBIFS_INO_NODE_SZ; c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; /* * When the amount of flash space used by buds becomes * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. * The writers are unblocked when the commit is finished. To avoid * writers to be blocked UBIFS initiates background commit in advance, * when number of bud bytes becomes above the limit defined below. */ c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; /* * Ensure minimum journal size. All the bytes in the journal heads are * considered to be used, when calculating the current journal usage. * Consequently, if the journal is too small, UBIFS will treat it as * always full. */ tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; if (c->bg_bud_bytes < tmp64) c->bg_bud_bytes = tmp64; if (c->max_bud_bytes < tmp64 + c->leb_size) c->max_bud_bytes = tmp64 + c->leb_size; err = ubifs_calc_lpt_geom(c); if (err) return err; /* Initialize effective LEB size used in budgeting calculations */ c->idx_leb_size = c->leb_size - c->max_idx_node_sz; return 0; } /* * init_constants_master - initialize UBIFS constants. * @c: UBIFS file-system description object * * This is a helper function which initializes various UBIFS constants after * the master node has been read. It also checks various UBIFS parameters and * makes sure they are all right. */ static void init_constants_master(struct ubifs_info *c) { long long tmp64; c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * * Subtract the LEB reserved for GC, the LEB which is reserved for * deletions, minimum LEBs for the index, the LEBs which are reserved * for each journal head. */ tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt; tmp64 *= (long long)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; } /** * take_gc_lnum - reserve GC LEB. * @c: UBIFS file-system description object * * This function ensures that the LEB reserved for garbage collection is marked * as "taken" in lprops. We also have to set free space to LEB size and dirty * space to zero, because lprops may contain out-of-date information if the * file-system was un-mounted before it has been committed. This function * returns zero in case of success and a negative error code in case of * failure. */ static int take_gc_lnum(struct ubifs_info *c) { int err; if (c->gc_lnum == -1) { ubifs_err(c, "no LEB for GC"); return -EINVAL; } /* And we have to tell lprops that this LEB is taken */ err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, LPROPS_TAKEN, 0, 0); return err; } /** * alloc_wbufs - allocate write-buffers. * @c: UBIFS file-system description object * * This helper function allocates and initializes UBIFS write-buffers. Returns * zero in case of success and %-ENOMEM in case of failure. */ static int alloc_wbufs(struct ubifs_info *c) { int i, err; c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead), GFP_KERNEL); if (!c->jheads) return -ENOMEM; /* Initialize journal heads */ for (i = 0; i < c->jhead_cnt; i++) { INIT_LIST_HEAD(&c->jheads[i].buds_list); err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); if (err) goto out_wbuf; c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; c->jheads[i].wbuf.jhead = i; c->jheads[i].grouped = 1; c->jheads[i].log_hash = ubifs_hash_get_desc(c); if (IS_ERR(c->jheads[i].log_hash)) { err = PTR_ERR(c->jheads[i].log_hash); goto out_log_hash; } } /* * Garbage Collector head does not need to be synchronized by timer. * Also GC head nodes are not grouped. */ c->jheads[GCHD].wbuf.no_timer = 1; c->jheads[GCHD].grouped = 0; return 0; out_log_hash: kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); out_wbuf: while (i--) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; return err; } /** * free_wbufs - free write-buffers. * @c: UBIFS file-system description object */ static void free_wbufs(struct ubifs_info *c) { int i; if (c->jheads) { for (i = 0; i < c->jhead_cnt; i++) { kfree(c->jheads[i].wbuf.buf); kfree(c->jheads[i].wbuf.inodes); kfree(c->jheads[i].log_hash); } kfree(c->jheads); c->jheads = NULL; } } /** * free_orphans - free orphans. * @c: UBIFS file-system description object */ static void free_orphans(struct ubifs_info *c) { struct ubifs_orphan *orph; while (c->orph_dnext) { orph = c->orph_dnext; c->orph_dnext = orph->dnext; list_del(&orph->list); kfree(orph); } while (!list_empty(&c->orph_list)) { orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); ubifs_err(c, "orphan list not empty at unmount"); } vfree(c->orph_buf); c->orph_buf = NULL; } /** * free_buds - free per-bud objects. * @c: UBIFS file-system description object */ static void free_buds(struct ubifs_info *c) { struct ubifs_bud *bud, *n; rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) { kfree(bud->log_hash); kfree(bud); } } /** * check_volume_empty - check if the UBI volume is empty. * @c: UBIFS file-system description object * * This function checks if the UBIFS volume is empty by looking if its LEBs are * mapped or not. The result of checking is stored in the @c->empty variable. * Returns zero in case of success and a negative error code in case of * failure. */ static int check_volume_empty(struct ubifs_info *c) { int lnum, err; c->empty = 1; for (lnum = 0; lnum < c->leb_cnt; lnum++) { err = ubifs_is_mapped(c, lnum); if (unlikely(err < 0)) return err; if (err == 1) { c->empty = 0; break; } cond_resched(); } return 0; } /* * UBIFS mount options. * * Opt_fast_unmount: do not run a journal commit before un-mounting * Opt_norm_unmount: run a journal commit before un-mounting * Opt_bulk_read: enable bulk-reads * Opt_no_bulk_read: disable bulk-reads * Opt_chk_data_crc: check CRCs when reading data nodes * Opt_no_chk_data_crc: do not check CRCs when reading data nodes * Opt_override_compr: override default compressor * Opt_assert: set ubifs_assert() action * Opt_auth_key: The key name used for authentication * Opt_auth_hash_name: The hash type used for authentication * Opt_err: just end of array marker */ enum { Opt_fast_unmount, Opt_norm_unmount, Opt_bulk_read, Opt_no_bulk_read, Opt_chk_data_crc, Opt_no_chk_data_crc, Opt_override_compr, Opt_assert, Opt_auth_key, Opt_auth_hash_name, Opt_ignore, }; static const struct constant_table ubifs_param_compr[] = { { "none", UBIFS_COMPR_NONE }, { "lzo", UBIFS_COMPR_LZO }, { "zlib", UBIFS_COMPR_ZLIB }, { "zstd", UBIFS_COMPR_ZSTD }, {} }; static const struct constant_table ubifs_param_assert[] = { { "report", ASSACT_REPORT }, { "read-only", ASSACT_RO }, { "panic", ASSACT_PANIC }, {} }; static const struct fs_parameter_spec ubifs_fs_param_spec[] = { fsparam_flag ("fast_unmount", Opt_fast_unmount), fsparam_flag ("norm_unmount", Opt_norm_unmount), fsparam_flag ("bulk_read", Opt_bulk_read), fsparam_flag ("no_bulk_read", Opt_no_bulk_read), fsparam_flag ("chk_data_crc", Opt_chk_data_crc), fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc), fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr), fsparam_enum ("assert", Opt_assert, ubifs_param_assert), fsparam_string ("auth_key", Opt_auth_key), fsparam_string ("auth_hash_name", Opt_auth_hash_name), fsparam_string ("ubi", Opt_ignore), fsparam_string ("vol", Opt_ignore), {} }; struct ubifs_fs_context { struct ubifs_mount_opts mount_opts; char *auth_key_name; char *auth_hash_name; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; unsigned int assert_action:2; }; /** * ubifs_parse_param - parse a parameter. * @fc: the filesystem context * @param: the parameter to parse * * This function parses UBIFS mount options and returns zero in case success * and a negative error code in case of failure. */ static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ubifs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); int opt; opt = fs_parse(fc, ubifs_fs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { /* * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. * We accept them in order to be backward-compatible. But this * should be removed at some point. */ case Opt_fast_unmount: ctx->mount_opts.unmount_mode = 2; break; case Opt_norm_unmount: ctx->mount_opts.unmount_mode = 1; break; case Opt_bulk_read: ctx->mount_opts.bulk_read = 2; ctx->bulk_read = 1; break; case Opt_no_bulk_read: ctx->mount_opts.bulk_read = 1; ctx->bulk_read = 0; break; case Opt_chk_data_crc: ctx->mount_opts.chk_data_crc = 2; ctx->no_chk_data_crc = 0; break; case Opt_no_chk_data_crc: ctx->mount_opts.chk_data_crc = 1; ctx->no_chk_data_crc = 1; break; case Opt_override_compr: ctx->mount_opts.compr_type = result.uint_32; ctx->mount_opts.override_compr = 1; ctx->default_compr = ctx->mount_opts.compr_type; break; case Opt_assert: ctx->assert_action = result.uint_32; break; case Opt_auth_key: if (!is_remount) { kfree(ctx->auth_key_name); ctx->auth_key_name = param->string; param->string = NULL; } break; case Opt_auth_hash_name: if (!is_remount) { kfree(ctx->auth_hash_name); ctx->auth_hash_name = param->string; param->string = NULL; } break; case Opt_ignore: break; } return 0; } /* * ubifs_release_options - release mount parameters which have been dumped. * @c: UBIFS file-system description object */ static void ubifs_release_options(struct ubifs_info *c) { kfree(c->auth_key_name); c->auth_key_name = NULL; kfree(c->auth_hash_name); c->auth_hash_name = NULL; } /** * destroy_journal - destroy journal data structures. * @c: UBIFS file-system description object * * This function destroys journal data structures including those that may have * been created by recovery functions. */ static void destroy_journal(struct ubifs_info *c) { while (!list_empty(&c->unclean_leb_list)) { struct ubifs_unclean_leb *ucleb; ucleb = list_entry(c->unclean_leb_list.next, struct ubifs_unclean_leb, list); list_del(&ucleb->list); kfree(ucleb); } while (!list_empty(&c->old_buds)) { struct ubifs_bud *bud; bud = list_entry(c->old_buds.next, struct ubifs_bud, list); list_del(&bud->list); kfree(bud->log_hash); kfree(bud); } ubifs_destroy_idx_gc(c); ubifs_destroy_size_tree(c); ubifs_tnc_close(c); free_buds(c); } /** * bu_init - initialize bulk-read information. * @c: UBIFS file-system description object */ static void bu_init(struct ubifs_info *c) { ubifs_assert(c, c->bulk_read == 1); if (c->bu.buf) return; /* Already initialized */ again: c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); if (!c->bu.buf) { if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { c->max_bu_buf_len = UBIFS_KMALLOC_OK; goto again; } /* Just disable bulk-read */ ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it", c->max_bu_buf_len); c->mount_opts.bulk_read = 1; c->bulk_read = 0; return; } } /** * check_free_space - check if there is enough free space to mount. * @c: UBIFS file-system description object * * This function makes sure UBIFS has enough free space to be mounted in * read/write mode. UBIFS must always have some free space to allow deletions. */ static int check_free_space(struct ubifs_info *c) { ubifs_assert(c, c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { ubifs_err(c, "insufficient free space to mount in R/W mode"); ubifs_dump_budg(c, &c->bi); ubifs_dump_lprops(c); return -ENOSPC; } return 0; } /** * mount_ubifs - mount UBIFS file-system. * @c: UBIFS file-system description object * * This function mounts UBIFS file system. Returns zero in case of success and * a negative error code in case of failure. */ static int mount_ubifs(struct ubifs_info *c) { int err; long long x, y; size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); /* Suppress error messages while probing if SB_SILENT is set */ c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) return err; err = ubifs_debugging_init(c); if (err) return err; err = ubifs_sysfs_register(c); if (err) goto out_debugging; err = check_volume_empty(c); if (err) goto out_free; if (c->empty && (c->ro_mount || c->ro_media)) { /* * This UBI volume is empty, and read-only, or the file system * is mounted read-only - we cannot format it. */ ubifs_err(c, "can't format empty UBI volume: read-only %s", c->ro_media ? "UBI volume" : "mount"); err = -EROFS; goto out_free; } if (c->ro_media && !c->ro_mount) { ubifs_err(c, "cannot mount read-write - read-only media"); err = -EROFS; goto out_free; } /* * The requirement for the buffer is that it should fit indexing B-tree * height amount of integers. We assume the height if the TNC tree will * never exceed 64. */ err = -ENOMEM; c->bottom_up_buf = kmalloc_array(BOTTOM_UP_HEIGHT, sizeof(int), GFP_KERNEL); if (!c->bottom_up_buf) goto out_free; c->sbuf = vmalloc(c->leb_size); if (!c->sbuf) goto out_free; if (!c->ro_mount) { c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) goto out_free; } if (c->bulk_read == 1) bu_init(c); if (!c->ro_mount) { c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) goto out_free; } c->mounting = 1; if (c->auth_key_name) { if (IS_ENABLED(CONFIG_UBIFS_FS_AUTHENTICATION)) { err = ubifs_init_authentication(c); if (err) goto out_free; } else { ubifs_err(c, "auth_key_name, but UBIFS is built without" " authentication support"); err = -EINVAL; goto out_free; } } err = ubifs_read_superblock(c); if (err) goto out_auth; c->probing = 0; /* * Make sure the compressor which is set as default in the superblock * or overridden by mount options is actually compiled in. */ if (!ubifs_compr_present(c, c->default_compr)) { ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c, c->default_compr)); err = -ENOTSUPP; goto out_auth; } err = init_constants_sb(c); if (err) goto out_auth; sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; goto out_auth; } err = alloc_wbufs(c); if (err) goto out_cbuf; sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!c->ro_mount) { /* Create background thread */ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out_wbufs; } } err = ubifs_read_master(c); if (err) goto out_master; init_constants_master(c); if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { ubifs_msg(c, "recovery needed"); c->need_recovery = 1; } if (c->need_recovery && !c->ro_mount) { err = ubifs_recover_inl_heads(c, c->sbuf); if (err) goto out_master; } err = ubifs_lpt_init(c, 1, !c->ro_mount); if (err) goto out_master; if (!c->ro_mount && c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) goto out_lpt; } if (!c->ro_mount && !c->need_recovery) { /* * Set the "dirty" flag so that if we reboot uncleanly we * will notice this immediately on the next mount. */ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) goto out_lpt; } /* * Handle offline signed images: Now that the master node is * written and its validation no longer depends on the hash * in the superblock, we can update the offline signed * superblock with a HMAC version, */ if (ubifs_authenticated(c) && ubifs_hmac_zero(c, c->sup_node->hmac)) { err = ubifs_hmac_wkm(c, c->sup_node->hmac_wkm); if (err) goto out_lpt; c->superblock_need_write = 1; } if (!c->ro_mount && c->superblock_need_write) { err = ubifs_write_sb_node(c, c->sup_node); if (err) goto out_lpt; c->superblock_need_write = 0; } err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; err = ubifs_replay_journal(c); if (err) goto out_journal; /* Calculate 'min_idx_lebs' after journal replay */ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); if (err) goto out_orphans; if (!c->ro_mount) { int lnum; err = check_free_space(c); if (err) goto out_orphans; /* Check for enough log space */ lnum = c->lhead_lnum + 1; if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) lnum = UBIFS_LOG_LNUM; if (lnum == c->ltail_lnum) { err = ubifs_consolidate_log(c); if (err) goto out_orphans; } if (c->need_recovery) { if (!ubifs_authenticated(c)) { err = ubifs_recover_size(c, true); if (err) goto out_orphans; } err = ubifs_rcvry_gc_commit(c); if (err) goto out_orphans; if (ubifs_authenticated(c)) { err = ubifs_recover_size(c, false); if (err) goto out_orphans; } } else { err = take_gc_lnum(c); if (err) goto out_orphans; /* * GC LEB may contain garbage if there was an unclean * reboot, and it should be un-mapped. */ err = ubifs_leb_unmap(c, c->gc_lnum); if (err) goto out_orphans; } err = dbg_check_lprops(c); if (err) goto out_orphans; } else if (c->need_recovery) { err = ubifs_recover_size(c, false); if (err) goto out_orphans; } else { /* * Even if we mount read-only, we have to set space in GC LEB * to proper value because this affects UBIFS free space * reporting. We do not want to have a situation when * re-mounting from R/O to R/W changes amount of free space. */ err = take_gc_lnum(c); if (err) goto out_orphans; } spin_lock(&ubifs_infos_lock); list_add_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); if (c->need_recovery) { if (c->ro_mount) ubifs_msg(c, "recovery deferred"); else { c->need_recovery = 0; ubifs_msg(c, "recovery completed"); /* * GC LEB has to be empty and taken at this point. But * the journal head LEBs may also be accounted as * "empty taken" if they are empty. */ ubifs_assert(c, c->lst.taken_empty_lebs > 0); } } else ubifs_assert(c, c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) goto out_infos; dbg_debugfs_init_fs(c); c->mounting = 0; ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s", c->vi.ubi_num, c->vi.vol_id, c->vi.name, c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", c->leb_size, c->leb_size >> 10, c->min_io_size, c->max_write_size); ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), max %d LEBs, journal size %lld bytes (%lld MiB, %d LEBs)", x, x >> 20, c->main_lebs, c->max_leb_cnt, y, y >> 20, c->log_lebs + c->max_bud_cnt); ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, c->big_lpt ? ", big LPT model" : ", small LPT model"); dbg_gen("default compressor: %s", ubifs_compr_name(c, c->default_compr)); dbg_gen("data journal heads: %d", c->jhead_cnt - NONDATA_JHEADS_CNT); dbg_gen("log LEBs: %d (%d - %d)", c->log_lebs, UBIFS_LOG_LNUM, c->log_last); dbg_gen("LPT area LEBs: %d (%d - %d)", c->lpt_lebs, c->lpt_first, c->lpt_last); dbg_gen("orphan area LEBs: %d (%d - %d)", c->orph_lebs, c->orph_first, c->orph_last); dbg_gen("main area LEBs: %d (%d - %d)", c->main_lebs, c->main_first, c->leb_cnt - 1); dbg_gen("index LEBs: %d", c->lst.idx_lebs); dbg_gen("total index bytes: %llu (%llu KiB, %llu MiB)", c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, c->bi.old_idx_sz >> 20); dbg_gen("key hash type: %d", c->key_hash_type); dbg_gen("tree fanout: %d", c->fanout); dbg_gen("reserved GC LEB: %d", c->gc_lnum); dbg_gen("max. znode size %d", c->max_znode_sz); dbg_gen("max. index node size %d", c->max_idx_node_sz); dbg_gen("node sizes: data %zu, inode %zu, dentry %zu", UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); dbg_gen("node sizes: trun %zu, sb %zu, master %zu", UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); dbg_gen("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); dbg_gen("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_gen("dead watermark: %d", c->dead_wm); dbg_gen("dark watermark: %d", c->dark_wm); dbg_gen("LEB overhead: %d", c->leb_overhead); x = (long long)c->main_lebs * c->dark_wm; dbg_gen("max. dark space: %lld (%lld KiB, %lld MiB)", x, x >> 10, x >> 20); dbg_gen("maximum bud bytes: %lld (%lld KiB, %lld MiB)", c->max_bud_bytes, c->max_bud_bytes >> 10, c->max_bud_bytes >> 20); dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", c->bg_bud_bytes, c->bg_bud_bytes >> 10, c->bg_bud_bytes >> 20); dbg_gen("current bud bytes %lld (%lld KiB, %lld MiB)", c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); dbg_gen("max. seq. number: %llu", c->max_sqnum); dbg_gen("commit number: %llu", c->cmt_no); dbg_gen("max. xattrs per inode: %d", ubifs_xattr_max_cnt(c)); dbg_gen("max orphans: %d", c->max_orphans); return 0; out_infos: spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); out_orphans: free_orphans(c); out_journal: destroy_journal(c); out_lpt: ubifs_lpt_free(c, 0); out_master: kfree(c->mst_node); kfree(c->rcvrd_mst_node); if (c->bgt) kthread_stop(c->bgt); out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); out_auth: ubifs_exit_authentication(c); out_free: kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_sysfs_unregister(c); out_debugging: ubifs_debugging_exit(c); return err; } /** * ubifs_umount - un-mount UBIFS file-system. * @c: UBIFS file-system description object * * Note, this function is called to free allocated resourced when un-mounting, * as well as free resources when an error occurred while we were half way * through mounting (error path cleanup function). So it has to make sure the * resource was actually allocated before freeing it. */ static void ubifs_umount(struct ubifs_info *c) { dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); dbg_debugfs_exit_fs(c); spin_lock(&ubifs_infos_lock); list_del(&c->infos_list); spin_unlock(&ubifs_infos_lock); if (c->bgt) kthread_stop(c->bgt); destroy_journal(c); free_wbufs(c); free_orphans(c); ubifs_lpt_free(c, 0); ubifs_exit_authentication(c); ubifs_release_options(c); kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); kfree(c->write_reserve_buf); kfree(c->bu.buf); vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); kfree(c->sup_node); ubifs_debugging_exit(c); ubifs_sysfs_unregister(c); } /** * ubifs_remount_rw - re-mount in read-write mode. * @c: UBIFS file-system description object * * UBIFS avoids allocating many unnecessary resources when mounted in read-only * mode. This function allocates the needed resources and re-mounts UBIFS in * read-write mode. */ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; if (c->rw_incompat) { ubifs_err(c, "the file-system is not R/W-compatible"); ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); return -EROFS; } mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; c->ro_mount = 0; if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) goto out; } err = check_free_space(c); if (err) goto out; if (c->need_recovery) { ubifs_msg(c, "completing deferred recovery"); err = ubifs_write_rcvrd_mst_node(c); if (err) goto out; if (!ubifs_authenticated(c)) { err = ubifs_recover_size(c, true); if (err) goto out; } err = ubifs_clean_lebs(c, c->sbuf); if (err) goto out; err = ubifs_recover_inl_heads(c, c->sbuf); if (err) goto out; } else { /* A readonly mount is not allowed to have orphans */ ubifs_assert(c, c->tot_orphans == 0); err = ubifs_clear_orphans(c); if (err) goto out; } if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); err = ubifs_write_master(c); if (err) goto out; } if (c->superblock_need_write) { struct ubifs_sb_node *sup = c->sup_node; err = ubifs_write_sb_node(c, sup); if (err) goto out; c->superblock_need_write = 0; } c->ileb_buf = vmalloc(c->leb_size); if (!c->ileb_buf) { err = -ENOMEM; goto out; } c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) { err = -ENOMEM; goto out; } err = ubifs_lpt_init(c, 0, 1); if (err) goto out; /* Create background thread */ c->bgt = kthread_run(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out; } c->orph_buf = vmalloc(c->leb_size); if (!c->orph_buf) { err = -ENOMEM; goto out; } /* Check for enough log space */ lnum = c->lhead_lnum + 1; if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) lnum = UBIFS_LOG_LNUM; if (lnum == c->ltail_lnum) { err = ubifs_consolidate_log(c); if (err) goto out; } if (c->need_recovery) { err = ubifs_rcvry_gc_commit(c); if (err) goto out; if (ubifs_authenticated(c)) { err = ubifs_recover_size(c, false); if (err) goto out; } } else { err = ubifs_leb_unmap(c, c->gc_lnum); } if (err) goto out; dbg_gen("re-mounted read-write"); c->remounting_rw = 0; if (c->need_recovery) { c->need_recovery = 0; ubifs_msg(c, "deferred recovery completed"); } else { /* * Do not run the debugging space check if the were doing * recovery, because when we saved the information we had the * file-system in a state where the TNC and lprops has been * modified in memory, but all the I/O operations (including a * commit) were deferred. So the file-system was in * "non-committed" state. Now the file-system is in committed * state, and of course the amount of free space will change * because, for example, the old index size was imprecise. */ err = dbg_check_space_info(c); } mutex_unlock(&c->umount_mutex); return err; out: c->ro_mount = 1; vfree(c->orph_buf); c->orph_buf = NULL; if (c->bgt) { kthread_stop(c->bgt); c->bgt = NULL; } kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->remounting_rw = 0; mutex_unlock(&c->umount_mutex); return err; } /** * ubifs_remount_ro - re-mount in read-only mode. * @c: UBIFS file-system description object * * We assume VFS has stopped writing. Possibly the background thread could be * running a commit, however kthread_stop will wait in that case. */ static void ubifs_remount_ro(struct ubifs_info *c) { int i, err; ubifs_assert(c, !c->need_recovery); ubifs_assert(c, !c->ro_mount); mutex_lock(&c->umount_mutex); if (c->bgt) { kthread_stop(c->bgt); c->bgt = NULL; } dbg_save_space_info(c); for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) ubifs_ro_mode(c, err); } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); err = ubifs_write_master(c); if (err) ubifs_ro_mode(c, err); vfree(c->orph_buf); c->orph_buf = NULL; kfree(c->write_reserve_buf); c->write_reserve_buf = NULL; vfree(c->ileb_buf); c->ileb_buf = NULL; ubifs_lpt_free(c, 1); c->ro_mount = 1; err = dbg_check_space_info(c); if (err) ubifs_ro_mode(c, err); mutex_unlock(&c->umount_mutex); } static void ubifs_put_super(struct super_block *sb) { int i; struct ubifs_info *c = sb->s_fs_info; ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num); /* * The following asserts are only valid if there has not been a failure * of the media. For example, there will be dirty inodes if we failed * to write them back because of I/O errors. */ if (!c->ro_error) { ubifs_assert(c, c->bi.idx_growth == 0); ubifs_assert(c, c->bi.dd_growth == 0); ubifs_assert(c, c->bi.data_growth == 0); } /* * The 'c->umount_lock' prevents races between UBIFS memory shrinker * and file system un-mount. Namely, it prevents the shrinker from * picking this superblock for shrinking - it will be just skipped if * the mutex is locked. */ mutex_lock(&c->umount_mutex); if (!c->ro_mount) { /* * First of all kill the background thread to make sure it does * not interfere with un-mounting and freeing resources. */ if (c->bgt) { kthread_stop(c->bgt); c->bgt = NULL; } /* * On fatal errors c->ro_error is set to 1, in which case we do * not write the master node. */ if (!c->ro_error) { int err; /* Synchronize write-buffers */ for (i = 0; i < c->jhead_cnt; i++) { err = ubifs_wbuf_sync(&c->jheads[i].wbuf); if (err) ubifs_ro_mode(c, err); } /* * We are being cleanly unmounted which means the * orphans were killed - indicate this in the master * node. Also save the reserved GC LEB number. */ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); err = ubifs_write_master(c); if (err) /* * Recovery will attempt to fix the master area * next mount, so we just print a message and * continue to unmount normally. */ ubifs_err(c, "failed to write master node, error %d", err); } else { for (i = 0; i < c->jhead_cnt; i++) /* Make sure write-buffer timers are canceled */ hrtimer_cancel(&c->jheads[i].wbuf.timer); } } ubifs_umount(c); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); } static int ubifs_reconfigure(struct fs_context *fc) { struct ubifs_fs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; int err; struct ubifs_info *c = sb->s_fs_info; sync_filesystem(sb); dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags); /* * Apply the mount option changes. * auth_key_name and auth_hash_name are ignored on remount. */ c->mount_opts = ctx->mount_opts; c->bulk_read = ctx->bulk_read; c->no_chk_data_crc = ctx->no_chk_data_crc; c->default_compr = ctx->default_compr; c->assert_action = ctx->assert_action; if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; } if (c->ro_media) { ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O"); return -EROFS; } err = ubifs_remount_rw(c); if (err) return err; } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; } ubifs_remount_ro(c); } if (c->bulk_read == 1) bu_init(c); else { dbg_gen("disable bulk-read"); mutex_lock(&c->bu_mutex); kfree(c->bu.buf); c->bu.buf = NULL; mutex_unlock(&c->bu_mutex); } if (!c->need_recovery) ubifs_assert(c, c->lst.taken_empty_lebs > 0); return 0; } const struct super_operations ubifs_super_operations = { .alloc_inode = ubifs_alloc_inode, .free_inode = ubifs_free_inode, .put_super = ubifs_put_super, .write_inode = ubifs_write_inode, .drop_inode = ubifs_drop_inode, .evict_inode = ubifs_evict_inode, .statfs = ubifs_statfs, .dirty_inode = ubifs_dirty_inode, .show_options = ubifs_show_options, .sync_fs = ubifs_sync_fs, }; /** * open_ubi - parse UBI device name string and open the UBI device. * @fc: The filesystem context * @mode: UBI volume open mode * * The primary method of mounting UBIFS is by specifying the UBI volume * character device node path. However, UBIFS may also be mounted without any * character device node using one of the following methods: * * o ubiX_Y - mount UBI device number X, volume Y; * o ubiY - mount UBI device number 0, volume Y; * o ubiX:NAME - mount UBI device X, volume with name NAME; * o ubi:NAME - mount UBI device 0, volume with name NAME. * * Alternative '!' separator may be used instead of ':' (because some shells * like busybox may interpret ':' as an NFS host name separator). This function * returns UBI volume description object in case of success and a negative * error code in case of failure. */ static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode) { struct ubi_volume_desc *ubi; const char *name = fc->source; int dev, vol; char *endptr; /* First, try to open using the device node path method */ ubi = ubi_open_volume_path(name, mode); if (!IS_ERR(ubi)) return ubi; /* Try the "nodev" method */ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') goto invalid_source; /* ubi:NAME method */ if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') return ubi_open_volume_nm(0, name + 4, mode); if (!isdigit(name[3])) goto invalid_source; dev = simple_strtoul(name + 3, &endptr, 0); /* ubiY method */ if (*endptr == '\0') return ubi_open_volume(0, dev, mode); /* ubiX_Y method */ if (*endptr == '_' && isdigit(endptr[1])) { vol = simple_strtoul(endptr + 1, &endptr, 0); if (*endptr != '\0') goto invalid_source; return ubi_open_volume(dev, vol, mode); } /* ubiX:NAME method */ if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') return ubi_open_volume_nm(dev, ++endptr, mode); invalid_source: return ERR_PTR(invalf(fc, "Invalid source name")); } static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) { struct ubifs_info *c; c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); if (c) { spin_lock_init(&c->cnt_lock); spin_lock_init(&c->cs_lock); spin_lock_init(&c->buds_lock); spin_lock_init(&c->space_lock); spin_lock_init(&c->orphan_lock); init_rwsem(&c->commit_sem); mutex_init(&c->lp_mutex); mutex_init(&c->tnc_mutex); mutex_init(&c->log_mutex); mutex_init(&c->umount_mutex); mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); init_waitqueue_head(&c->reserve_space_wq); atomic_set(&c->need_wait_space, 0); c->buds = RB_ROOT; c->old_idx = RB_ROOT; c->size_tree = RB_ROOT; c->orph_tree = RB_ROOT; INIT_LIST_HEAD(&c->infos_list); INIT_LIST_HEAD(&c->idx_gc); INIT_LIST_HEAD(&c->replay_list); INIT_LIST_HEAD(&c->replay_buds); INIT_LIST_HEAD(&c->uncat_list); INIT_LIST_HEAD(&c->empty_list); INIT_LIST_HEAD(&c->freeable_list); INIT_LIST_HEAD(&c->frdi_idx_list); INIT_LIST_HEAD(&c->unclean_leb_list); INIT_LIST_HEAD(&c->old_buds); INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); c->no_chk_data_crc = 1; c->assert_action = ASSACT_RO; c->highest_inum = UBIFS_FIRST_INO; c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; ubi_get_volume_info(ubi, &c->vi); ubi_get_device_info(c->vi.ubi_num, &c->di); } return c; } static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c = sb->s_fs_info; struct ubifs_fs_context *ctx = fc->fs_private; struct inode *root; int err; c->vfs_sb = sb; /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); goto out; } /* Copy in parsed mount options */ c->mount_opts = ctx->mount_opts; c->auth_key_name = ctx->auth_key_name; c->auth_hash_name = ctx->auth_hash_name; c->no_chk_data_crc = ctx->no_chk_data_crc; c->bulk_read = ctx->bulk_read; c->default_compr = ctx->default_compr; c->assert_action = ctx->assert_action; /* ubifs_info owns auth strings now */ ctx->auth_key_name = NULL; ctx->auth_hash_name = NULL; /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in read_folio, * which means the user would have to wait not just for their own I/O * but the read-ahead I/O as well i.e. completely pointless. * * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also * @sb->s_bdi->capabilities are initialized to 0 so there won't be any * writeback happening. */ err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num, c->vi.vol_id); if (err) goto out_close; sb->s_bdi->ra_pages = 0; sb->s_bdi->io_pages = 0; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; fscrypt_set_ops(sb, &ubifs_crypt_operations); mutex_lock(&c->umount_mutex); err = mount_ubifs(c); if (err) { ubifs_assert(c, err < 0); goto out_unlock; } /* Read the root inode */ root = ubifs_iget(sb, UBIFS_ROOT_INO); if (IS_ERR(root)) { err = PTR_ERR(root); goto out_umount; } generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; goto out_umount; } super_set_uuid(sb, c->uuid, sizeof(c->uuid)); super_set_sysfs_name_generic(sb, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); mutex_unlock(&c->umount_mutex); return 0; out_umount: ubifs_umount(c); out_unlock: mutex_unlock(&c->umount_mutex); out_close: ubifs_release_options(c); ubi_close_volume(c->ubi); out: return err; } static int sb_test(struct super_block *sb, struct fs_context *fc) { struct ubifs_info *c1 = fc->s_fs_info; struct ubifs_info *c = sb->s_fs_info; return c->vi.cdev == c1->vi.cdev; } static int ubifs_get_tree(struct fs_context *fc) { struct ubi_volume_desc *ubi; struct ubifs_info *c; struct super_block *sb; int err; if (!fc->source || !*fc->source) return invalf(fc, "No source specified"); dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags); /* * Get UBI device number and volume ID. Mount it read-only so far * because this might be a new mount point, and UBI allows only one * read-write user at a time. */ ubi = open_ubi(fc, UBI_READONLY); if (IS_ERR(ubi)) { err = PTR_ERR(ubi); if (!(fc->sb_flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", current->pid, fc->source, err); return err; } c = alloc_ubifs_info(ubi); if (!c) { err = -ENOMEM; goto out_close; } fc->s_fs_info = c; dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); sb = sget_fc(fc, sb_test, set_anon_super_fc); if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); goto out_close; } if (sb->s_root) { struct ubifs_info *c1 = sb->s_fs_info; kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { err = ubifs_fill_super(sb, fc); if (err) goto out_deact; /* We do not support atime */ sb->s_flags |= SB_ACTIVE; if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) ubifs_msg(c, "full atime support is enabled."); else sb->s_flags |= SB_NOATIME; } /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); fc->root = dget(sb->s_root); return 0; out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); return err; } static void kill_ubifs_super(struct super_block *s) { struct ubifs_info *c = s->s_fs_info; kill_anon_super(s); kfree(c); } static void ubifs_free_fc(struct fs_context *fc) { struct ubifs_fs_context *ctx = fc->fs_private; if (ctx) { kfree(ctx->auth_key_name); kfree(ctx->auth_hash_name); kfree(ctx); } } static const struct fs_context_operations ubifs_context_ops = { .free = ubifs_free_fc, .parse_param = ubifs_parse_param, .get_tree = ubifs_get_tree, .reconfigure = ubifs_reconfigure, }; static int ubifs_init_fs_context(struct fs_context *fc) { struct ubifs_fs_context *ctx; ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) { /* Iniitialize for first mount */ ctx->no_chk_data_crc = 1; ctx->assert_action = ASSACT_RO; } else { struct ubifs_info *c = fc->root->d_sb->s_fs_info; /* * Preserve existing options across remounts. * auth_key_name and auth_hash_name are not remountable. */ ctx->mount_opts = c->mount_opts; ctx->bulk_read = c->bulk_read; ctx->no_chk_data_crc = c->no_chk_data_crc; ctx->default_compr = c->default_compr; ctx->assert_action = c->assert_action; } fc->ops = &ubifs_context_ops; fc->fs_private = ctx; return 0; } static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .init_fs_context = ubifs_init_fs_context, .parameters = ubifs_fs_param_spec, .kill_sb = kill_ubifs_super, }; MODULE_ALIAS_FS("ubifs"); /* * Inode slab cache constructor. */ static void inode_slab_ctor(void *obj) { struct ubifs_inode *ui = obj; inode_init_once(&ui->vfs_inode); } static int __init ubifs_init(void) { int err = -ENOMEM; BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); /* Make sure node sizes are 8-byte aligned */ BUILD_BUG_ON(UBIFS_CH_SZ & 7); BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); BUILD_BUG_ON(MIN_WRITE_SZ & 7); /* Check min. node size */ BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); /* Defined node sizes */ BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); /* * We use 2 bit wide bit-fields to store compression type, which should * be amended if more compressors are added. The bit-fields are: * @compr_type in 'struct ubifs_inode', @default_compr in * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. */ BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); /* * We require that PAGE_SIZE is greater-than-or-equal-to * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. */ if (PAGE_SIZE < UBIFS_BLOCK_SIZE) { pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", current->pid, (unsigned int)PAGE_SIZE); return -EINVAL; } ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab"); if (!ubifs_shrinker_info) goto out_slab; ubifs_shrinker_info->count_objects = ubifs_shrink_count; ubifs_shrinker_info->scan_objects = ubifs_shrink_scan; shrinker_register(ubifs_shrinker_info); err = ubifs_compressors_init(); if (err) goto out_shrinker; dbg_debugfs_init(); err = ubifs_sysfs_init(); if (err) goto out_dbg; err = register_filesystem(&ubifs_fs_type); if (err) { pr_err("UBIFS error (pid %d): cannot register file system, error %d", current->pid, err); goto out_sysfs; } return 0; out_sysfs: ubifs_sysfs_exit(); out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); out_shrinker: shrinker_free(ubifs_shrinker_info); out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; } /* late_initcall to let compressors initialize first */ late_initcall(ubifs_init); static void __exit ubifs_exit(void) { WARN_ON(!list_empty(&ubifs_infos)); WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0); dbg_debugfs_exit(); ubifs_sysfs_exit(); ubifs_compressors_exit(); shrinker_free(ubifs_shrinker_info); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ubifs_inode_slab); unregister_filesystem(&ubifs_fs_type); } module_exit(ubifs_exit); MODULE_LICENSE("GPL"); MODULE_VERSION(__stringify(UBIFS_VERSION)); MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); MODULE_DESCRIPTION("UBIFS - UBI File System");
1 1916 2 541 402 102 4 414 415 12 403 415 2 694 1337 26 767 568 129 129 129 35 61 28 54 541 31 1928 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _IP6_FIB_H #define _IP6_FIB_H #include <linux/ipv6_route.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/notifier.h> #include <net/dst.h> #include <net/flow.h> #include <net/ip_fib.h> #include <net/netlink.h> #include <net/inetpeer.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> #include <uapi/linux/bpf.h> #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 #else #define FIB6_TABLE_HASHSZ 1 #endif #define RT6_DEBUG 2 struct rt6_info; struct fib6_info; struct fib6_config { u32 fc_table; u32 fc_metric; int fc_dst_len; int fc_src_len; int fc_ifindex; u32 fc_flags; u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, fc_ignore_dev_down:1, __unused : 14; u32 fc_nh_id; struct in6_addr fc_dst; struct in6_addr fc_src; struct in6_addr fc_prefsrc; struct in6_addr fc_gateway; unsigned long fc_expires; struct nlattr *fc_mx; int fc_mx_len; int fc_mp_len; struct nlattr *fc_mp; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; bool fc_is_fdb; }; struct fib6_node { struct fib6_node __rcu *parent; struct fib6_node __rcu *left; struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; struct fib6_gc_args { int timeout; int more; }; #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL static inline bool fib6_routes_require_src(const struct net *net) { return false; } static inline void fib6_routes_require_src_inc(struct net *net) {} static inline void fib6_routes_require_src_dec(struct net *net) {} #else static inline bool fib6_routes_require_src(const struct net *net) { return net->ipv6.fib6_routes_require_src > 0; } static inline void fib6_routes_require_src_inc(struct net *net) { net->ipv6.fib6_routes_require_src++; } static inline void fib6_routes_require_src_dec(struct net *net) { net->ipv6.fib6_routes_require_src--; } #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif /* * routing information * */ struct rt6key { struct in6_addr addr; int plen; }; struct fib6_table; struct rt6_exception_bucket { struct hlist_head chain; int depth; }; struct rt6_exception { struct hlist_node hlist; struct rt6_info *rt6i; unsigned long stamp; struct rcu_head rcu; }; #define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 struct fib6_nh { struct fib_nh_common nh_common; #ifdef CONFIG_IPV6_ROUTER_PREF unsigned long last_probe; #endif struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; }; struct fib6_info { struct fib6_table *fib6_table; struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ union { struct list_head fib6_siblings; struct list_head nh_list; }; unsigned int fib6_nsiblings; refcount_t fib6_ref; unsigned long expires; struct hlist_node gc_link; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct rt6key fib6_dst; u32 fib6_flags; struct rt6key fib6_src; struct rt6key fib6_prefsrc; u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; u8 offload; u8 trap; u8 offload_failed; u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, unused:4; struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; }; struct rt6_info { struct dst_entry dst; struct fib6_info __rcu *from; int sernum; struct rt6key rt6i_dst; struct rt6key rt6i_src; struct in6_addr rt6i_gateway; struct inet6_dev *rt6i_idev; u32 rt6i_flags; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; }; struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) #define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst) static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst) { return dst_rt6_info(dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) { return rt->fib6_src.plen > 0; } /* The callers should hold f6i->fib6_table->tb6_lock if a route has ever * been added to a table before. */ static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } /* The callers should hold f6i->fib6_table->tb6_lock if a route has ever * been added to a table before. */ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely * Return false if not */ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; } return status; } static inline u32 rt6_get_cookie(const struct rt6_info *rt) { struct fib6_info *from; u32 cookie = 0; if (rt->sernum) return rt->sernum; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) { /* dst_release() accepts a NULL parameter. * We rely on dst being first structure in struct rt6_info */ BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0); dst_release(&rt->dst); } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh); void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu); } } enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif FWS_L, FWS_R, FWS_C, FWS_U }; struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; unsigned int skip_in_node; int (*func)(struct fib6_walker *); void *args; }; struct rt6_statistics { __u32 fib_nodes; /* all fib6 nodes */ __u32 fib_route_nodes; /* intermediate nodes */ __u32 fib_rt_entries; /* rt entries in fib table */ __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ }; #define RTN_TL_ROOT 0x0001 #define RTN_ROOT 0x0002 /* tree root node */ #define RTN_RTINFO 0x0004 /* node with valid routing info */ /* * priority levels (or metrics) * */ struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC #define RT6_TABLE_MAIN RT_TABLE_MAIN #define RT6_TABLE_DFLT RT6_TABLE_MAIN #define RT6_TABLE_INFO RT6_TABLE_MAIN #define RT6_TABLE_PREFIX RT6_TABLE_MAIN #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_MIN 1 #define FIB6_TABLE_MAX RT_TABLE_MAX #define RT6_TABLE_LOCAL RT_TABLE_LOCAL #else #define FIB6_TABLE_MIN RT_TABLE_MAIN #define FIB6_TABLE_MAX FIB6_TABLE_MIN #define RT6_TABLE_LOCAL RT6_TABLE_MAIN #endif typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, struct flowi6 *, const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib6_info *rt; unsigned int nsiblings; }; /* * exported functions */ struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); /* called with rcu lock held; can return error pointer * caller needs to select path */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); static inline void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) { const struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) *addr = from->fib6_prefsrc.addr; else *addr = in6addr_any; rcu_read_unlock(); } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack); int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); void fib6_gc_cleanup(void); int fib6_init(void); /* Add the route to the gc list if it is not already there * * The callers should hold f6i->fib6_table->tb6_lock. */ static inline void fib6_add_gc_list(struct fib6_info *f6i) { /* If fib6_node is null, the f6i is not in (or removed from) the * table. * * There is a gap between finding the f6i from the table and * calling this function without the protection of the tb6_lock. * This check makes sure the f6i is not added to the gc list when * it is not on the table. */ if (!rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock))) return; if (hlist_unhashed(&f6i->gc_link)) hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); } /* Remove the route from the gc list if it is on the list. * * The callers should hold f6i->fib6_table->tb6_lock. */ static inline void fib6_remove_gc_list(struct fib6_info *f6i) { if (!hlist_unhashed(&f6i->gc_link)) hlist_del_init(&f6i->gc_link); } struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; loff_t skip; struct fib6_table *tbl; int sernum; }; extern const struct seq_operations ipv6_route_seq_ops; int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i); void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed); #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct fib6_info *, rt); }; #endif INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup, struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return INDIRECT_CALL_4(lookup, ip6_pol_route_output, ip6_pol_route_input, ip6_pol_route_lookup, __ip6_route_redirect, net, table, fl6, skb, flags); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { return net->ipv6.fib6_has_custom_rules; } int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv6.fib6_rules_require_fldissect) return false; memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; return true; } #else static inline bool fib6_has_custom_rules(const struct net *net) { return false; } static inline int fib6_rules_init(void) { return 0; } static inline void fib6_rules_cleanup(void) { return ; } static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { return false; } #endif #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 /* * net/tipc/ib_media.c: Infiniband bearer support for TIPC * * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> * * Based on eth_media.c, which carries the following copyright notice: * * Copyright (c) 2001-2007, Ericsson AB * Copyright (c) 2005-2008, 2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/if_infiniband.h> #include "core.h" #include "bearer.h" #define TIPC_MAX_IB_LINK_WIN 500 /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) { if (str_size < 60) /* 60 = 19 * strlen("xx:") + strlen("xx\0") */ return 1; sprintf(str_buf, "%20phC", a->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); memcpy(msg, addr->value, INFINIBAND_ALEN); return 0; } /* Convert raw InfiniBand address format to media addr format */ static int tipc_ib_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); memcpy(addr->value, msg, INFINIBAND_ALEN); addr->media_id = TIPC_MEDIA_TYPE_IB; addr->broadcast = !memcmp(msg, b->bcast_addr.value, INFINIBAND_ALEN); return 0; } /* Convert discovery msg addr format to InfiniBand media addr format */ static int tipc_ib_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { return tipc_ib_raw2addr(b, addr, msg); } /* InfiniBand media registration info */ struct tipc_media ib_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_ib_addr2str, .addr2msg = tipc_ib_addr2msg, .msg2addr = tipc_ib_msg2addr, .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" };
33 32 33 33 14 20 14 33 33 20 15 5 20 20 11 9 20 20 8 5 13 12 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 32 8 29 2 33 31 2 32 1 32 30 14 7 4 6 2 5 5 7 7 7 7 7 31 23 32 17 31 25 30 32 32 28 22 30 30 24 32 2 31 24 32 28 28 28 28 33 1 32 32 32 32 28 32 32 31 33 31 33 33 32 33 33 32 33 33 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ /* zstd_decompress_block : * this module takes care of decompressing _compressed_ block */ /*-******************************************************* * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ #include "../common/compiler.h" /* prefetch */ #include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" #include "../common/huf.h" #include "../common/zstd_internal.h" #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ #include "zstd_decompress_block.h" #include "../common/bits.h" /* ZSTD_highbit32 */ /*_******************************************************* * Macros **********************************************************/ /* These two optional macros force the use one way or another of the two * ZSTD_decompressSequences implementations. You can't force in both directions * at the same time. */ #if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) #error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!" #endif /*_******************************************************* * Memory operations **********************************************************/ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } /*-************************************************************* * Block decoding ***************************************************************/ static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) { size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); return blockSizeMax; } /*! ZSTD_getcBlockSize() : * Provides the size of compressed block from block header `src` */ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bpPtr) { RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); { U32 const cBlockHeader = MEM_readLE24(src); U32 const cSize = cBlockHeader >> 3; bpPtr->lastBlock = cBlockHeader & 1; bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); bpPtr->origSize = cSize; /* only useful for RLE */ if (bpPtr->blockType == bt_rle) return 1; RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); return cSize; } } /* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) { size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); assert(litSize <= blockSizeMax); assert(dctx->isFrameDecompression || streaming == not_streaming); assert(expectedWriteSize <= blockSizeMax); if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { /* If we aren't streaming, we can just put the literals after the output * of the current block. We don't need to worry about overwriting the * extDict of our window, because it doesn't exist. * So if we have space after the end of the block, just put it there. */ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; dctx->litBufferEnd = dctx->litBuffer + litSize; dctx->litBufferLocation = ZSTD_in_dst; } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { /* Literals fit entirely within the extra buffer, put them there to avoid * having to split the literals. */ dctx->litBuffer = dctx->litExtraBuffer; dctx->litBufferEnd = dctx->litBuffer + litSize; dctx->litBufferLocation = ZSTD_not_in_dst; } else { assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); /* Literals must be split between the output block and the extra lit * buffer. We fill the extra lit buffer with the tail of the literals, * and put the rest of the literals at the end of the block, with * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. * This MUST not write more than our maxBlockSize beyond dst, because in * streaming mode, that could overwrite part of our extDict window. */ if (splitImmediately) { /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; } else { /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; } dctx->litBufferLocation = ZSTD_split; assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); } } /*! ZSTD_decodeLiteralsBlock() : * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ void* dst, size_t dstCapacity, const streaming_operation streaming) { DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); { const BYTE* const istart = (const BYTE*) src; SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); switch(litEncType) { case set_repeat: DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); ZSTD_FALLTHROUGH; case set_compressed: RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); { size_t lhSize, litSize, litCSize; U32 singleStream=0; U32 const lhlCode = (istart[0] >> 2) & 3; U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); int const flags = 0 | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ /* 2 - 2 - 10 - 10 */ singleStream = !lhlCode; lhSize = 3; litSize = (lhc >> 4) & 0x3FF; litCSize = (lhc >> 14) & 0x3FF; break; case 2: /* 2 - 2 - 14 - 14 */ lhSize = 4; litSize = (lhc >> 4) & 0x3FFF; litCSize = lhc >> 18; break; case 3: /* 2 - 2 - 18 - 18 */ lhSize = 5; litSize = (lhc >> 4) & 0x3FFFF; litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); break; } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); if (!singleStream) RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, "Not enough literals (%zu) for the 4-streams mode (min %u)", litSize, MIN_LITERALS_FOR_4_STREAMS); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); /* prefetch huffman table if cold */ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); } if (litEncType==set_repeat) { if (singleStream) { hufSuccess = HUF_decompress1X_usingDTable( dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, flags); } else { assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); hufSuccess = HUF_decompress4X_usingDTable( dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->HUFptr, flags); } } else { if (singleStream) { #if defined(HUF_FORCE_DECOMPRESS_X2) hufSuccess = HUF_decompress1X_DCtx_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, sizeof(dctx->workspace), flags); #else hufSuccess = HUF_decompress1X1_DCtx_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, sizeof(dctx->workspace), flags); #endif } else { hufSuccess = HUF_decompress4X_hufOnly_wksp( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, sizeof(dctx->workspace), flags); } } if (dctx->litBufferLocation == ZSTD_split) { assert(litSize > ZSTD_LITBUFFEREXTRASIZE); ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); } RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; dctx->litEntropy = 1; if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; return litCSize + lhSize; } case set_basic: { size_t litSize, lhSize; U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ lhSize = 1; litSize = istart[0] >> 3; break; case 1: lhSize = 2; litSize = MEM_readLE16(istart) >> 4; break; case 3: lhSize = 3; RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); litSize = MEM_readLE24(istart) >> 4; break; } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); if (dctx->litBufferLocation == ZSTD_split) { ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); } else { ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr = istart+lhSize; dctx->litSize = litSize; dctx->litBufferEnd = dctx->litPtr + litSize; dctx->litBufferLocation = ZSTD_not_in_dst; return lhSize+litSize; } case set_rle: { U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t litSize, lhSize; size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ lhSize = 1; litSize = istart[0] >> 3; break; case 1: lhSize = 2; RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); litSize = MEM_readLE16(istart) >> 4; break; case 3: lhSize = 3; RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); litSize = MEM_readLE24(istart) >> 4; break; } RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (dctx->litBufferLocation == ZSTD_split) { ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); } else { ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; return lhSize+1; } default: RETURN_ERROR(corruption_detected, "impossible"); } } } /* Hidden declaration for fullbench */ size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, const void* src, size_t srcSize, void* dst, size_t dstCapacity); size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, const void* src, size_t srcSize, void* dst, size_t dstCapacity) { dctx->isFrameDecompression = 0; return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); } /* Default FSE distribution tables. * These are pre-calculated FSE decoding tables using default distributions as defined in specification : * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions * They were generated programmatically with following method : * - start from default distributions, present in /lib/common/zstd_internal.h * - generate tables normally, using ZSTD_buildFSETable() * - printout the content of tables * - prettify output, report below, test with fuzzer to ensure it's correct */ /* Default FSE distribution table for Literal Lengths */ static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = { { 1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ /* nextState, nbAddBits, nbBits, baseVal */ { 0, 0, 4, 0}, { 16, 0, 4, 0}, { 32, 0, 5, 1}, { 0, 0, 5, 3}, { 0, 0, 5, 4}, { 0, 0, 5, 6}, { 0, 0, 5, 7}, { 0, 0, 5, 9}, { 0, 0, 5, 10}, { 0, 0, 5, 12}, { 0, 0, 6, 14}, { 0, 1, 5, 16}, { 0, 1, 5, 20}, { 0, 1, 5, 22}, { 0, 2, 5, 28}, { 0, 3, 5, 32}, { 0, 4, 5, 48}, { 32, 6, 5, 64}, { 0, 7, 5, 128}, { 0, 8, 6, 256}, { 0, 10, 6, 1024}, { 0, 12, 6, 4096}, { 32, 0, 4, 0}, { 0, 0, 4, 1}, { 0, 0, 5, 2}, { 32, 0, 5, 4}, { 0, 0, 5, 5}, { 32, 0, 5, 7}, { 0, 0, 5, 8}, { 32, 0, 5, 10}, { 0, 0, 5, 11}, { 0, 0, 6, 13}, { 32, 1, 5, 16}, { 0, 1, 5, 18}, { 32, 1, 5, 22}, { 0, 2, 5, 24}, { 32, 3, 5, 32}, { 0, 3, 5, 40}, { 0, 6, 4, 64}, { 16, 6, 4, 64}, { 32, 7, 5, 128}, { 0, 9, 6, 512}, { 0, 11, 6, 2048}, { 48, 0, 4, 0}, { 16, 0, 4, 1}, { 32, 0, 5, 2}, { 32, 0, 5, 3}, { 32, 0, 5, 5}, { 32, 0, 5, 6}, { 32, 0, 5, 8}, { 32, 0, 5, 9}, { 32, 0, 5, 11}, { 32, 0, 5, 12}, { 0, 0, 6, 15}, { 32, 1, 5, 18}, { 32, 1, 5, 20}, { 32, 2, 5, 24}, { 32, 2, 5, 28}, { 32, 3, 5, 40}, { 32, 4, 5, 48}, { 0, 16, 6,65536}, { 0, 15, 6,32768}, { 0, 14, 6,16384}, { 0, 13, 6, 8192}, }; /* LL_defaultDTable */ /* Default FSE distribution table for Offset Codes */ static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = { { 1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ /* nextState, nbAddBits, nbBits, baseVal */ { 0, 0, 5, 0}, { 0, 6, 4, 61}, { 0, 9, 5, 509}, { 0, 15, 5,32765}, { 0, 21, 5,2097149}, { 0, 3, 5, 5}, { 0, 7, 4, 125}, { 0, 12, 5, 4093}, { 0, 18, 5,262141}, { 0, 23, 5,8388605}, { 0, 5, 5, 29}, { 0, 8, 4, 253}, { 0, 14, 5,16381}, { 0, 20, 5,1048573}, { 0, 2, 5, 1}, { 16, 7, 4, 125}, { 0, 11, 5, 2045}, { 0, 17, 5,131069}, { 0, 22, 5,4194301}, { 0, 4, 5, 13}, { 16, 8, 4, 253}, { 0, 13, 5, 8189}, { 0, 19, 5,524285}, { 0, 1, 5, 1}, { 16, 6, 4, 61}, { 0, 10, 5, 1021}, { 0, 16, 5,65533}, { 0, 28, 5,268435453}, { 0, 27, 5,134217725}, { 0, 26, 5,67108861}, { 0, 25, 5,33554429}, { 0, 24, 5,16777213}, }; /* OF_defaultDTable */ /* Default FSE distribution table for Match Lengths */ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = { { 1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */ /* nextState, nbAddBits, nbBits, baseVal */ { 0, 0, 6, 3}, { 0, 0, 4, 4}, { 32, 0, 5, 5}, { 0, 0, 5, 6}, { 0, 0, 5, 8}, { 0, 0, 5, 9}, { 0, 0, 5, 11}, { 0, 0, 6, 13}, { 0, 0, 6, 16}, { 0, 0, 6, 19}, { 0, 0, 6, 22}, { 0, 0, 6, 25}, { 0, 0, 6, 28}, { 0, 0, 6, 31}, { 0, 0, 6, 34}, { 0, 1, 6, 37}, { 0, 1, 6, 41}, { 0, 2, 6, 47}, { 0, 3, 6, 59}, { 0, 4, 6, 83}, { 0, 7, 6, 131}, { 0, 9, 6, 515}, { 16, 0, 4, 4}, { 0, 0, 4, 5}, { 32, 0, 5, 6}, { 0, 0, 5, 7}, { 32, 0, 5, 9}, { 0, 0, 5, 10}, { 0, 0, 6, 12}, { 0, 0, 6, 15}, { 0, 0, 6, 18}, { 0, 0, 6, 21}, { 0, 0, 6, 24}, { 0, 0, 6, 27}, { 0, 0, 6, 30}, { 0, 0, 6, 33}, { 0, 1, 6, 35}, { 0, 1, 6, 39}, { 0, 2, 6, 43}, { 0, 3, 6, 51}, { 0, 4, 6, 67}, { 0, 5, 6, 99}, { 0, 8, 6, 259}, { 32, 0, 4, 4}, { 48, 0, 4, 4}, { 16, 0, 4, 5}, { 32, 0, 5, 7}, { 32, 0, 5, 8}, { 32, 0, 5, 10}, { 32, 0, 5, 11}, { 0, 0, 6, 14}, { 0, 0, 6, 17}, { 0, 0, 6, 20}, { 0, 0, 6, 23}, { 0, 0, 6, 26}, { 0, 0, 6, 29}, { 0, 0, 6, 32}, { 0, 16, 6,65539}, { 0, 15, 6,32771}, { 0, 14, 6,16387}, { 0, 13, 6, 8195}, { 0, 12, 6, 4099}, { 0, 11, 6, 2051}, { 0, 10, 6, 1027}, }; /* ML_defaultDTable */ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) { void* ptr = dt; ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; ZSTD_seqSymbol* const cell = dt + 1; DTableH->tableLog = 0; DTableH->fastMode = 0; cell->nbBits = 0; cell->nextState = 0; assert(nbAddBits < 255); cell->nbAdditionalBits = nbAddBits; cell->baseValue = baseValue; } /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) * cannot fail if input is valid => * all inputs are presumed validated at this stage */ FORCE_INLINE_TEMPLATE void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode = dt+1; U32 const maxSV1 = maxSymbolValue + 1; U32 const tableSize = 1 << tableLog; U16* symbolNext = (U16*)wksp; BYTE* spread = (BYTE*)(symbolNext + MaxSeq + 1); U32 highThreshold = tableSize - 1; /* Sanity Checks */ assert(maxSymbolValue <= MaxSeq); assert(tableLog <= MaxFSELog); assert(wkspSize >= ZSTD_BUILD_FSE_TABLE_WKSP_SIZE); (void)wkspSize; /* Init, lay down lowprob symbols */ { ZSTD_seqSymbol_header DTableH; DTableH.tableLog = tableLog; DTableH.fastMode = 1; { S16 const largeLimit= (S16)(1 << (tableLog-1)); U32 s; for (s=0; s<maxSV1; s++) { if (normalizedCounter[s]==-1) { tableDecode[highThreshold--].baseValue = s; symbolNext[s] = 1; } else { if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; assert(normalizedCounter[s]>=0); symbolNext[s] = (U16)normalizedCounter[s]; } } } ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); } /* Spread symbols */ assert(tableSize <= 512); /* Specialized symbol spreading for the case when there are * no low probability (-1 count) symbols. When compressing * small blocks we avoid low probability symbols to hit this * case, since header decoding speed matters more. */ if (highThreshold == tableSize - 1) { size_t const tableMask = tableSize-1; size_t const step = FSE_TABLESTEP(tableSize); /* First lay down the symbols in order. * We use a uint64_t to lay down 8 bytes at a time. This reduces branch * misses since small blocks generally have small table logs, so nearly * all symbols have counts <= 8. We ensure we have 8 bytes at the end of * our buffer to handle the over-write. */ { U64 const add = 0x0101010101010101ull; size_t pos = 0; U64 sv = 0; U32 s; for (s=0; s<maxSV1; ++s, sv += add) { int i; int const n = normalizedCounter[s]; MEM_write64(spread + pos, sv); for (i = 8; i < n; i += 8) { MEM_write64(spread + pos + i, sv); } assert(n>=0); pos += (size_t)n; } } /* Now we spread those positions across the table. * The benefit of doing it in two stages is that we avoid the * variable size inner loop, which caused lots of branch misses. * Now we can run through all the positions without any branch misses. * We unroll the loop twice, since that is what empirically worked best. */ { size_t position = 0; size_t s; size_t const unroll = 2; assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */ for (s = 0; s < (size_t)tableSize; s += unroll) { size_t u; for (u = 0; u < unroll; ++u) { size_t const uPosition = (position + (u * step)) & tableMask; tableDecode[uPosition].baseValue = spread[s + u]; } position = (position + (unroll * step)) & tableMask; } assert(position == 0); } } else { U32 const tableMask = tableSize-1; U32 const step = FSE_TABLESTEP(tableSize); U32 s, position = 0; for (s=0; s<maxSV1; s++) { int i; int const n = normalizedCounter[s]; for (i=0; i<n; i++) { tableDecode[position].baseValue = s; position = (position + step) & tableMask; while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ } } assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ } /* Build Decoding table */ { U32 u; for (u=0; u<tableSize; u++) { U32 const symbol = tableDecode[u].baseValue; U32 const nextState = symbolNext[symbol]++; tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) ); tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; tableDecode[u].baseValue = baseValue[symbol]; } } } /* Avoids the FORCE_INLINE of the _body() function. */ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); } #if DYNAMIC_BMI2 BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); } #endif void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) { #if DYNAMIC_BMI2 if (bmi2) { ZSTD_buildFSETable_body_bmi2(dt, normalizedCounter, maxSymbolValue, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); return; } #endif (void)bmi2; ZSTD_buildFSETable_body_default(dt, normalizedCounter, maxSymbolValue, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize); } /*! ZSTD_buildSeqTable() : * @return : nb bytes read from src, * or an error code if it fails */ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, SymbolEncodingType_e type, unsigned max, U32 maxLog, const void* src, size_t srcSize, const U32* baseValue, const U8* nbAdditionalBits, const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, int bmi2) { switch(type) { case set_rle : RETURN_ERROR_IF(!srcSize, srcSize_wrong, ""); RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); { U32 const symbol = *(const BYTE*)src; U32 const baseline = baseValue[symbol]; U8 const nbBits = nbAdditionalBits[symbol]; ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); } *DTablePtr = DTableSpace; return 1; case set_basic : *DTablePtr = defaultTable; return 0; case set_repeat: RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); /* prefetch FSE table if used */ if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { const void* const pStart = *DTablePtr; size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); PREFETCH_AREA(pStart, pSize); } return 0; case set_compressed : { unsigned tableLog; S16 norm[MaxSeq+1]; size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog, wksp, wkspSize, bmi2); *DTablePtr = DTableSpace; return headerSize; } default : assert(0); RETURN_ERROR(GENERIC, "impossible"); } } size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize) { const BYTE* const istart = (const BYTE*)src; const BYTE* const iend = istart + srcSize; const BYTE* ip = istart; int nbSeq; DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); /* check */ RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); /* SeqHead */ nbSeq = *ip++; if (nbSeq > 0x7F) { if (nbSeq == 0xFF) { RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); nbSeq = MEM_readLE16(ip) + LONGNBSEQ; ip+=2; } else { RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); nbSeq = ((nbSeq-0x80)<<8) + *ip++; } } *nbSeqPtr = nbSeq; if (nbSeq == 0) { /* No sequence : section ends immediately */ RETURN_ERROR_IF(ip != iend, corruption_detected, "extraneous data present in the Sequences section"); return (size_t)(ip - istart); } /* FSE table descriptors */ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); ip++; /* Build DTables */ { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend-ip, LL_base, LL_bits, LL_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += llhSize; } { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend-ip, OF_base, OF_bits, OF_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += ofhSize; } { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend-ip, ML_base, ML_bits, ML_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += mlhSize; } } return ip-istart; } typedef struct { size_t litLength; size_t matchLength; size_t offset; } seq_t; typedef struct { size_t state; const ZSTD_seqSymbol* table; } ZSTD_fseState; typedef struct { BIT_DStream_t DStream; ZSTD_fseState stateLL; ZSTD_fseState stateOffb; ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; } seqState_t; /*! ZSTD_overlapCopy8() : * Copies 8 bytes from ip to op and updates op and ip where ip <= op. * If the offset is < 8 then the offset is spread to at least 8 bytes. * * Precondition: *ip <= *op * Postcondition: *op - *op >= 8 */ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { assert(*ip <= *op); if (offset < 8) { /* close range match, overlap */ static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ int const sub2 = dec64table[offset]; (*op)[0] = (*ip)[0]; (*op)[1] = (*ip)[1]; (*op)[2] = (*ip)[2]; (*op)[3] = (*ip)[3]; *ip += dec32table[offset]; ZSTD_copy4(*op+4, *ip); *ip -= sub2; } else { ZSTD_copy8(*op, *ip); } *ip += 8; *op += 8; assert(*op - *ip >= 8); } /*! ZSTD_safecopy() : * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer * and write up to 16 bytes past oend_w (op >= oend_w is allowed). * This function is only called in the uncommon case where the sequence is near the end of the block. It * should be fast for a single long sequence, but can be slow for several short sequences. * * @param ovtype controls the overlap detection * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. * The src buffer must be before the dst buffer. */ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { ptrdiff_t const diff = op - ip; BYTE* const oend = op + length; assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) || (ovtype == ZSTD_overlap_src_before_dst && diff >= 0)); if (length < 8) { /* Handle short lengths. */ while (op < oend) *op++ = *ip++; return; } if (ovtype == ZSTD_overlap_src_before_dst) { /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ assert(length >= 8); ZSTD_overlapCopy8(&op, &ip, diff); length -= 8; assert(op - ip >= 8); assert(op <= oend); } if (oend <= oend_w) { /* No risk of overwrite. */ ZSTD_wildcopy(op, ip, length, ovtype); return; } if (op <= oend_w) { /* Wildcopy until we get close to the end. */ assert(oend > oend_w); ZSTD_wildcopy(op, ip, oend_w - op, ovtype); ip += oend_w - op; op += oend_w - op; } /* Handle the leftovers. */ while (op < oend) *op++ = *ip++; } /* ZSTD_safecopyDstBeforeSrc(): * This version allows overlap with dst before src, or handles the non-overlap case with dst after src * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { ptrdiff_t const diff = op - ip; BYTE* const oend = op + length; if (length < 8 || diff > -8) { /* Handle short lengths, close overlaps, and dst not before src. */ while (op < oend) *op++ = *ip++; return; } if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); ip += oend - WILDCOPY_OVERLENGTH - op; op += oend - WILDCOPY_OVERLENGTH - op; } /* Handle the leftovers. */ while (op < oend) *op++ = *ip++; } /* ZSTD_execSequenceEnd(): * This version handles cases that are near the end of the output buffer. It requires * more careful checks to make sure there is no overflow. By separating out these hard * and unlikely cases, we can speed up the common cases. * * NOTE: This function needs to be fast for a single long sequence, but doesn't need * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). */ FORCE_NOINLINE ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceEnd(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* bounds checks : careful of address space overflow in 32-bit mode */ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); assert(op < op + sequenceLength); assert(oLitEnd < op + sequenceLength); /* copy literals */ ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); op = oLitEnd; *litPtr = iLitEnd; /* copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); match = dictEnd - (prefixStart - match); if (match + sequence.matchLength <= dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; } } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); return sequenceLength; } /* ZSTD_execSequenceEndSplitLitBuffer(): * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. */ FORCE_NOINLINE ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, BYTE* const oend, const BYTE* const oend_w, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; /* bounds checks : careful of address space overflow in 32-bit mode */ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); assert(op < op + sequenceLength); assert(oLitEnd < op + sequenceLength); /* copy literals */ RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); op = oLitEnd; *litPtr = iLitEnd; /* copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); match = dictEnd - (prefixStart - match); if (match + sequence.matchLength <= dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; } } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); return sequenceLength; } HINT_INLINE ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequence(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; assert(op != NULL /* Precondition */); assert(oend_w < oend /* No underflow */); #if defined(__aarch64__) /* prefetch sequence starting from match that will be used for copy later */ PREFETCH_L1(match); #endif /* Handle edge cases in a slow path: * - Read beyond end of literals * - Match end is within WILDCOPY_OVERLIMIT of oend * - 32-bit mode and the match length overflows */ if (UNLIKELY( iLitEnd > litLimit || oMatchEnd > oend_w || (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ assert(op <= oLitEnd /* No overflow */); assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); assert(oMatchEnd <= oend /* No underflow */); assert(iLitEnd <= litLimit /* Literal length is in bounds */); assert(oLitEnd <= oend_w /* Can wildcopy literals */); assert(oMatchEnd <= oend_w /* Can wildcopy matches */); /* Copy Literals: * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. * We likely don't need the full 32-byte wildcopy. */ assert(WILDCOPY_OVERLENGTH >= 16); ZSTD_copy16(op, (*litPtr)); if (UNLIKELY(sequence.litLength > 16)) { ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); } op = oLitEnd; *litPtr = iLitEnd; /* update for next sequence */ /* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; } } /* Match within prefix of 1 or more bytes */ assert(op <= oMatchEnd); assert(oMatchEnd <= oend_w); assert(match >= prefixStart); assert(sequence.matchLength >= 1); /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking. */ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes. */ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); return sequenceLength; } assert(sequence.offset < WILDCOPY_VECLEN); /* Copy 8 bytes and spread the offset to be >= 8. */ ZSTD_overlapCopy8(&op, &match, sequence.offset); /* If the match length is > 8 bytes, then continue with the wildcopy. */ if (sequence.matchLength > 8) { assert(op < oMatchEnd); ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); } return sequenceLength; } HINT_INLINE ZSTD_ALLOW_POINTER_OVERFLOW_ATTR size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, BYTE* const oend, const BYTE* const oend_w, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ const BYTE* const iLitEnd = *litPtr + sequence.litLength; const BYTE* match = oLitEnd - sequence.offset; assert(op != NULL /* Precondition */); assert(oend_w < oend /* No underflow */); /* Handle edge cases in a slow path: * - Read beyond end of literals * - Match end is within WILDCOPY_OVERLIMIT of oend * - 32-bit mode and the match length overflows */ if (UNLIKELY( iLitEnd > litLimit || oMatchEnd > oend_w || (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ assert(op <= oLitEnd /* No overflow */); assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); assert(oMatchEnd <= oend /* No underflow */); assert(iLitEnd <= litLimit /* Literal length is in bounds */); assert(oLitEnd <= oend_w /* Can wildcopy literals */); assert(oMatchEnd <= oend_w /* Can wildcopy matches */); /* Copy Literals: * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. * We likely don't need the full 32-byte wildcopy. */ assert(WILDCOPY_OVERLENGTH >= 16); ZSTD_copy16(op, (*litPtr)); if (UNLIKELY(sequence.litLength > 16)) { ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); } op = oLitEnd; *litPtr = iLitEnd; /* update for next sequence */ /* Copy Match */ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix -> go into extDict */ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); match = dictEnd + (match - prefixStart); if (match + sequence.matchLength <= dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; ZSTD_memmove(oLitEnd, match, length1); op = oLitEnd + length1; sequence.matchLength -= length1; match = prefixStart; } } /* Match within prefix of 1 or more bytes */ assert(op <= oMatchEnd); assert(oMatchEnd <= oend_w); assert(match >= prefixStart); assert(sequence.matchLength >= 1); /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy * without overlap checking. */ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { /* We bet on a full wildcopy for matches, since we expect matches to be * longer than literals (in general). In silesia, ~10% of matches are longer * than 16 bytes. */ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); return sequenceLength; } assert(sequence.offset < WILDCOPY_VECLEN); /* Copy 8 bytes and spread the offset to be >= 8. */ ZSTD_overlapCopy8(&op, &match, sequence.offset); /* If the match length is > 8 bytes, then continue with the wildcopy. */ if (sequence.matchLength > 8) { assert(op < oMatchEnd); ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); } return sequenceLength; } static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) { const void* ptr = dt; const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", (U32)DStatePtr->state, DTableH->tableLog); BIT_reloadDStream(bitD); DStatePtr->table = dt + 1; } FORCE_INLINE_TEMPLATE void ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) { size_t const lowBits = BIT_readBits(bitD, nbBits); DStatePtr->state = nextState + lowBits; } /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 * bits before reloading. This value is the maximum number of bytes we read * after reloading when we are decoding long offsets. */ #define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ : 0) typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; /* * ZSTD_decodeSequence(): * @p longOffsets : tells the decoder to reload more bit while decoding large offsets * only used in 32-bit mode * @return : Sequence (litL + matchL + offset) */ FORCE_INLINE_TEMPLATE seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) { seq_t seq; /* * ZSTD_seqSymbol is a 64 bits wide structure. * It can be loaded in one operation * and its fields extracted by simply shifting or bit-extracting on aarch64. * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh * operations that cause performance drop. This can be avoided by using this * ZSTD_memcpy hack. */ #if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ZSTD_seqSymbol* const llDInfo = &llDInfoS; ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); #else const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; #endif seq.matchLength = mlDInfo->baseValue; seq.litLength = llDInfo->baseValue; { U32 const ofBase = ofDInfo->baseValue; BYTE const llBits = llDInfo->nbAdditionalBits; BYTE const mlBits = mlDInfo->nbAdditionalBits; BYTE const ofBits = ofDInfo->nbAdditionalBits; BYTE const totalBits = llBits+mlBits+ofBits; U16 const llNext = llDInfo->nextState; U16 const mlNext = mlDInfo->nextState; U16 const ofNext = ofDInfo->nextState; U32 const llnbBits = llDInfo->nbBits; U32 const mlnbBits = mlDInfo->nbBits; U32 const ofnbBits = ofDInfo->nbBits; assert(llBits <= MaxLLBits); assert(mlBits <= MaxMLBits); assert(ofBits <= MaxOff); /* * As gcc has better branch and block analyzers, sometimes it is only * valuable to mark likeliness for clang, it gives around 3-4% of * performance. */ /* sequence */ { size_t offset; if (ofBits > 1) { ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { /* Always read extra bits, this keeps the logic simple, * avoids branches, and avoids accidentally reading 0 bits. */ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); BIT_reloadDStream(&seqState->DStream); offset += BIT_readBitsFast(&seqState->DStream, extraBits); } else { offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } seqState->prevOffset[2] = seqState->prevOffset[1]; seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset; } else { U32 const ll0 = (llDInfo->baseValue == 0); if (LIKELY((ofBits == 0))) { offset = seqState->prevOffset[ll0]; seqState->prevOffset[1] = seqState->prevOffset[!ll0]; seqState->prevOffset[0] = offset; } else { offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; seqState->prevOffset[1] = seqState->prevOffset[0]; seqState->prevOffset[0] = offset = temp; } } } seq.offset = offset; } if (mlBits > 0) seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) BIT_reloadDStream(&seqState->DStream); if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) BIT_reloadDStream(&seqState->DStream); /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); if (llBits > 0) seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); if (!isLastSeq) { /* don't update FSE state for last Sequence */ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ BIT_reloadDStream(&seqState->DStream); } } return seq; } #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) #if DEBUGLEVEL >= 1 static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) { size_t const windowSize = dctx->fParams.windowSize; /* No dictionary used. */ if (dctx->dictContentEndForFuzzing == NULL) return 0; /* Dictionary is our prefix. */ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; /* Dictionary is not our ext-dict. */ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; /* Dictionary is not within our window size. */ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; /* Dictionary is active. */ return 1; } #endif static void ZSTD_assertValidSequence( ZSTD_DCtx const* dctx, BYTE const* op, BYTE const* oend, seq_t const seq, BYTE const* prefixStart, BYTE const* virtualStart) { #if DEBUGLEVEL >= 1 if (dctx->isFrameDecompression) { size_t const windowSize = dctx->fParams.windowSize; size_t const sequenceSize = seq.litLength + seq.matchLength; BYTE const* const oLitEnd = op + seq.litLength; DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); assert(op <= oend); assert((size_t)(oend - op) >= sequenceSize); assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); /* Offset must be within the dictionary. */ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); assert(seq.offset <= windowSize + dictSize); } else { /* Offset must be within our window. */ assert(seq.offset <= windowSize); } } #else (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; #endif } #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); /* Literals are split between internal buffer & output buffer */ if (nbSeq) { seqState_t seqState; dctx->fseEntropy = 1; { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } RETURN_ERROR_IF( ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), corruption_detected, ""); ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); assert(dst != NULL); ZSTD_STATIC_ASSERT( BIT_DStream_unfinished < BIT_DStream_completed && BIT_DStream_endOfBuffer < BIT_DStream_completed && BIT_DStream_completed < BIT_DStream_overflow); /* decompress without overrunning litPtr begins */ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ /* Align the decompression loop to 32 + 16 bytes. * * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression * speed swings based on the alignment of the decompression loop. This * performance swing is caused by parts of the decompression loop falling * out of the DSB. The entire decompression loop should fit in the DSB, * when it can't we get much worse performance. You can measure if you've * hit the good case or the bad case with this perf command for some * compressed file test.zst: * * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst * * If you see most cycles served out of the MITE you've hit the bad case. * If you see most cycles served out of the DSB you've hit the good case. * If it is pretty even then you may be in an okay case. * * This issue has been reproduced on the following CPUs: * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 * Use Instruments->Counters to get DSB/MITE cycles. * I never got performance swings, but I was able to * go from the good case of mostly DSB to half of the * cycles served from MITE. * - Coffeelake: Intel i9-9900k * - Coffeelake: Intel i7-9700k * * I haven't been able to reproduce the instability or DSB misses on any * of the following CPUS: * - Haswell * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH * - Skylake * * Alignment is done for each of the three major decompression loops: * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer * - ZSTD_decompressSequences_body * Alignment choices are made to minimize large swings on bad cases and influence on performance * from changes external to this code, rather than to overoptimize on the current commit. * * If you are seeing performance stability this script can help test. * It tests on 4 commits in zstd where I saw performance change. * * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 */ #if defined(__x86_64__) __asm__(".p2align 6"); # if __GNUC__ >= 7 /* good for gcc-7, gcc-9, and gcc-11 */ __asm__("nop"); __asm__(".p2align 5"); __asm__("nop"); __asm__(".p2align 4"); # if __GNUC__ == 8 || __GNUC__ == 10 /* good for gcc-8 and gcc-10 */ __asm__("nop"); __asm__(".p2align 3"); # endif # endif #endif /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ for ( ; nbSeq; nbSeq--) { sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); if (litPtr + sequence.litLength > dctx->litBufferEnd) break; { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; } } DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ if (nbSeq > 0) { const size_t leftoverLit = dctx->litBufferEnd - litPtr; DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); if (leftoverLit) { RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); sequence.litLength -= leftoverLit; op += leftoverLit; } litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; } nbSeq--; } } if (nbSeq > 0) { /* there is remaining lit from extra buffer */ #if defined(__x86_64__) __asm__(".p2align 6"); __asm__("nop"); # if __GNUC__ != 7 /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ __asm__(".p2align 4"); __asm__("nop"); __asm__(".p2align 3"); # elif __GNUC__ >= 11 __asm__(".p2align 3"); # else __asm__(".p2align 5"); __asm__("nop"); __asm__(".p2align 3"); # endif #endif for ( ; nbSeq ; nbSeq--) { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; } } /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); RETURN_ERROR_IF(nbSeq, corruption_detected, ""); DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); /* save reps for next block */ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ if (dctx->litBufferLocation == ZSTD_split) { /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memmove(op, litPtr, lastLLSize); op += lastLLSize; } litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; } /* copy last literals from internal buffer */ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memcpy(op, litPtr, lastLLSize); op += lastLLSize; } } DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); return (size_t)(op - ostart); } FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* const litEnd = litPtr + dctx->litSize; const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); /* Regen sequences */ if (nbSeq) { seqState_t seqState; dctx->fseEntropy = 1; { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } RETURN_ERROR_IF( ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), corruption_detected, ""); ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); assert(dst != NULL); #if defined(__x86_64__) __asm__(".p2align 6"); __asm__("nop"); # if __GNUC__ >= 7 __asm__(".p2align 5"); __asm__("nop"); __asm__(".p2align 3"); # else __asm__(".p2align 4"); __asm__("nop"); __asm__(".p2align 3"); # endif #endif for ( ; nbSeq ; nbSeq--) { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); #endif if (UNLIKELY(ZSTD_isError(oneSeqSize))) return oneSeqSize; DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); op += oneSeqSize; } /* check if reached exact end */ assert(nbSeq == 0); RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); /* save reps for next block */ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ { size_t const lastLLSize = (size_t)(litEnd - litPtr); DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memcpy(op, litPtr, lastLLSize); op += lastLLSize; } } DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); return (size_t)(op - ostart); } static size_t ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } static size_t ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT FORCE_INLINE_TEMPLATE size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, const BYTE* const prefixStart, const BYTE* const dictEnd) { prefetchPos += sequence.litLength; { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. * No consequence though : memory address is only used for prefetching, not for dereferencing */ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ } return prefetchPos + sequence.matchLength; } /* This decoding function employs prefetching * to reduce latency impact of cache misses. * It's generally employed when block contains a significant portion of long-distance matches * or when coupled with a "cold" dictionary */ FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); /* Regen sequences */ if (nbSeq) { #define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) #define ADVANCED_SEQS STORED_SEQS seq_t sequences[STORED_SEQS]; int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; int seqNb; size_t prefetchPos = (size_t)(op-prefixStart); /* track position relative to prefixStart */ dctx->fseEntropy = 1; { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } assert(dst != NULL); assert(iend >= ip); RETURN_ERROR_IF( ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), corruption_detected, ""); ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ for (seqNb=0; seqNb<seqAdvance; seqNb++) { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); sequences[seqNb] = sequence; } /* decompress without stomping litBuffer */ for (; seqNb < nbSeq; seqNb++) { seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1); if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ const size_t leftoverLit = dctx->litBufferEnd - litPtr; if (leftoverLit) { RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; op += leftoverLit; } litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif if (ZSTD_isError(oneSeqSize)) return oneSeqSize; prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); sequences[seqNb & STORED_SEQS_MASK] = sequence; op += oneSeqSize; } } else { /* lit buffer is either wholly contained in first or second split, or not split at all*/ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif if (ZSTD_isError(oneSeqSize)) return oneSeqSize; prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); sequences[seqNb & STORED_SEQS_MASK] = sequence; op += oneSeqSize; } } RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); /* finish queue */ seqNb -= seqAdvance; for ( ; seqNb<nbSeq ; seqNb++) { seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { const size_t leftoverLit = dctx->litBufferEnd - litPtr; if (leftoverLit) { RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); sequence->litLength -= leftoverLit; op += leftoverLit; } litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; dctx->litBufferLocation = ZSTD_not_in_dst; { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); #endif if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; } } else { size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); #endif if (ZSTD_isError(oneSeqSize)) return oneSeqSize; op += oneSeqSize; } } /* save reps for next block */ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } } /* last literal segment */ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ size_t const lastLLSize = litBufferEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memmove(op, litPtr, lastLLSize); op += lastLLSize; } litPtr = dctx->litExtraBuffer; litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; } { size_t const lastLLSize = litBufferEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { ZSTD_memmove(op, litPtr, lastLLSize); op += lastLLSize; } } return (size_t)(op - ostart); } static size_t ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ #if DYNAMIC_BMI2 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ #endif /* DYNAMIC_BMI2 */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG static size_t ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { DEBUGLOG(5, "ZSTD_decompressSequences"); #if DYNAMIC_BMI2 if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } static size_t ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); #if DYNAMIC_BMI2 if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT /* ZSTD_decompressSequencesLong() : * decompression function triggered when a minimum share of offsets is considered "long", * aka out of cache. * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". * This function will try to mitigate main memory latency through the use of prefetching */ static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset) { DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ /* * @returns The total size of the history referenceable by zstd, including * both the prefix and the extDict. At @p op any offset larger than this * is invalid. */ static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) { return (size_t)(op - virtualStart); } typedef struct { unsigned longOffsetShare; unsigned maxNbAdditionalBits; } ZSTD_OffsetInfo; /* ZSTD_getOffsetInfo() : * condition : offTable must be valid * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) * compared to maximum possible of (1<<OffFSELog), * as well as the maximum number additional bits required. */ static ZSTD_OffsetInfo ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq) { ZSTD_OffsetInfo info = {0, 0}; /* If nbSeq == 0, then the offTable is uninitialized, but we have * no sequences, so both values should be 0. */ if (nbSeq != 0) { const void* ptr = offTable; U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; const ZSTD_seqSymbol* table = offTable + 1; U32 const max = 1 << tableLog; U32 u; DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); assert(max <= (1 << OffFSELog)); /* max not too large */ for (u=0; u<max; u++) { info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits); if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1; } assert(tableLog <= OffFSELog); info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ } return info; } /* * @returns The maximum offset we can decode in one read of our bitstream, without * reloading more bits in the middle of the offset bits read. Any offsets larger * than this must use the long offset decoder. */ static size_t ZSTD_maxShortOffset(void) { if (MEM_64bits()) { /* We can decode any offset without reloading bits. * This might change if the max window size grows. */ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); return (size_t)-1; } else { /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. */ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); return maxOffset; } } size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); /* Note : the wording of the specification * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). * This generally does not happen, as it makes little sense, * since an uncompressed block would feature same size and have no decompression cost. * Also, note that decoder from reference libzstd before < v1.5.4 * would consider this edge case as an error. * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) * for broader compatibility with the deployed ecosystem of zstd decoders */ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); /* Decode literals section */ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; srcSize -= litCSize; } /* Build Decoding Tables */ { /* Compute the maximum block size, which must also work when !frame and fParams are unset. * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. */ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); /* isLongOffset must be true if there are long offsets. * Offsets are long if they are larger than ZSTD_maxShortOffset(). * We don't expect that to be the case in 64-bit mode. * * We check here to see if our history is large enough to allow long offsets. * If it isn't, then we can't possible have (valid) long offsets. If the offset * is invalid, then it is okay to read it incorrectly. * * If isLongOffsets is true, then we will later check our decoding table to see * if it is even possible to generate long offsets. */ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); /* These macros control at build-time which decompressor implementation * we use. If neither is defined, we do some inspection and dispatch at * runtime. */ #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) int usePrefetchDecoder = dctx->ddictIsCold; #else /* Set to 1 to avoid computing offset info if we don't need to. * Otherwise this value is ignored. */ int usePrefetchDecoder = 1; #endif int nbSeq; size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); if (ZSTD_isError(seqHSize)) return seqHSize; ip += seqHSize; srcSize -= seqHSize; RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, "invalid dst"); /* If we could potentially have long offsets, or we might want to use the prefetch decoder, * compute information about the share of long offsets, and the maximum nbAdditionalBits. * NOTE: could probably use a larger nbSeq limit */ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { /* If isLongOffset, but the maximum number of additional bits that we see in our table is small * enough, then we know it is impossible to have too long an offset in this block, so we can * use the regular offset decoder. */ isLongOffset = ZSTD_lo_isRegularOffset; } if (!usePrefetchDecoder) { U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ usePrefetchDecoder = (info.longOffsetShare >= minShare); } } dctx->ddictIsCold = 0; #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) if (usePrefetchDecoder) { #else (void)usePrefetchDecoder; { #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); #endif } #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ if (dctx->litBufferLocation == ZSTD_split) return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); else return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); #endif } } ZSTD_ALLOW_POINTER_OVERFLOW_ATTR void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) { if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ dctx->dictEnd = dctx->previousDstEnd; dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); dctx->prefixStart = dst; dctx->previousDstEnd = dst; } } size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { size_t dSize; dctx->isFrameDecompression = 0; ZSTD_checkContinuity(dctx, dst, dstCapacity); dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); FORWARD_IF_ERROR(dSize, ""); dctx->previousDstEnd = (char*)dst + dSize; return dSize; } /* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) { return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); }
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* X.509 certificate parser internal definitions * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/cleanup.h> #include <linux/time.h> #include <crypto/public_key.h> #include <keys/asymmetric-type.h> struct x509_certificate { struct x509_certificate *next; struct x509_certificate *signer; /* Certificate that signed this one */ struct public_key *pub; /* Public key details */ struct public_key_signature *sig; /* Signature parameters */ char *issuer; /* Name of certificate issuer */ char *subject; /* Name of certificate subject */ struct asymmetric_key_id *id; /* Issuer + Serial number */ struct asymmetric_key_id *skid; /* Subject + subjectKeyId (optional) */ time64_t valid_from; time64_t valid_to; const void *tbs; /* Signed data */ unsigned tbs_size; /* Size of signed data */ unsigned raw_sig_size; /* Size of signature */ const void *raw_sig; /* Signature data */ const void *raw_serial; /* Raw serial number in ASN.1 */ unsigned raw_serial_size; unsigned raw_issuer_size; const void *raw_issuer; /* Raw issuer name in ASN.1 */ const void *raw_subject; /* Raw subject name in ASN.1 */ unsigned raw_subject_size; unsigned raw_skid_size; const void *raw_skid; /* Raw subjectKeyId in ASN.1 */ unsigned index; bool seen; /* Infinite recursion prevention */ bool verified; bool self_signed; /* T if self-signed (check unsupported_sig too) */ bool unsupported_sig; /* T if signature uses unsupported crypto */ bool blacklisted; }; /* * x509_cert_parser.c */ extern void x509_free_certificate(struct x509_certificate *cert); DEFINE_FREE(x509_free_certificate, struct x509_certificate *, if (!IS_ERR(_T)) x509_free_certificate(_T)) extern struct x509_certificate *x509_cert_parse(const void *data, size_t datalen); extern int x509_decode_time(time64_t *_t, size_t hdrlen, unsigned char tag, const unsigned char *value, size_t vlen); /* * x509_public_key.c */ extern int x509_get_sig_params(struct x509_certificate *cert); extern int x509_check_for_self_signed(struct x509_certificate *cert);
7 1 1 4 1 3 1 3 3 3 6 2 4 9 1 8 4 1 2 5 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 /* * FUSE: Filesystem in Userspace * Copyright (C) 2016 Canonical Ltd. <seth.forshee@canonical.com> * * This program can be distributed under the terms of the GNU GPL. * See the file COPYING. */ #include "fuse_i.h" #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, struct inode *inode, int type, bool rcu) { int size; const char *name; void *value = NULL; struct posix_acl *acl; if (rcu) return ERR_PTR(-ECHILD); if (fuse_is_bad(inode)) return ERR_PTR(-EIO); if (fc->no_getxattr) return NULL; if (type == ACL_TYPE_ACCESS) name = XATTR_NAME_POSIX_ACL_ACCESS; else if (type == ACL_TYPE_DEFAULT) name = XATTR_NAME_POSIX_ACL_DEFAULT; else return ERR_PTR(-EOPNOTSUPP); value = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); size = fuse_getxattr(inode, name, value, PAGE_SIZE); if (size > 0) acl = posix_acl_from_xattr(fc->user_ns, value, size); else if ((size == 0) || (size == -ENODATA) || (size == -EOPNOTSUPP && fc->no_getxattr)) acl = NULL; else if (size == -ERANGE) acl = ERR_PTR(-E2BIG); else acl = ERR_PTR(size); kfree(value); return acl; } static inline bool fuse_no_acl(const struct fuse_conn *fc, const struct inode *inode) { /* * Refuse interacting with POSIX ACLs for daemons that * don't support FUSE_POSIX_ACL and are not mounted on * the host to retain backwards compatibility. */ return !fc->posix_acl && (i_user_ns(inode) != &init_user_ns); } struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); if (fuse_no_acl(fc, inode)) return ERR_PTR(-EOPNOTSUPP); return __fuse_get_acl(fc, inode, type, false); } struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) { struct fuse_conn *fc = get_fuse_conn(inode); /* * FUSE daemons before FUSE_POSIX_ACL was introduced could get and set * POSIX ACLs without them being used for permission checking by the * vfs. Retain that behavior for backwards compatibility as there are * filesystems that do all permission checking for acls in the daemon * and not in the kernel. */ if (!fc->posix_acl) return NULL; return __fuse_get_acl(fc, inode, type, rcu); } int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); const char *name; int ret; if (fuse_is_bad(inode)) return -EIO; if (fc->no_setxattr || fuse_no_acl(fc, inode)) return -EOPNOTSUPP; if (type == ACL_TYPE_ACCESS) name = XATTR_NAME_POSIX_ACL_ACCESS; else if (type == ACL_TYPE_DEFAULT) name = XATTR_NAME_POSIX_ACL_DEFAULT; else return -EINVAL; if (acl) { unsigned int extra_flags = 0; /* * Fuse userspace is responsible for updating access * permissions in the inode, if needed. fuse_setxattr * invalidates the inode attributes, which will force * them to be refreshed the next time they are used, * and it also updates i_ctime. */ size_t size = posix_acl_xattr_size(acl->a_count); void *value; if (size > PAGE_SIZE) return -E2BIG; value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); if (ret < 0) { kfree(value); return ret; } /* * Fuse daemons without FUSE_POSIX_ACL never changed the passed * through POSIX ACLs. Such daemons don't expect setgid bits to * be stripped. */ if (fc->posix_acl && !in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); kfree(value); } else { ret = fuse_removexattr(inode, name); } if (fc->posix_acl) { /* * Fuse daemons without FUSE_POSIX_ACL never cached POSIX ACLs * and didn't invalidate attributes. Retain that behavior. */ forget_all_cached_acls(inode); fuse_invalidate_attr(inode); } return ret; }
69 2 1 1 2 1 49 6 92 93 93 93 1 92 10 88 29 64 64 49 6 4 4 1 4 2 34 96 4 16 19 14 51 3 1 5 4 10 7 17 2 2 3 2 20 14 4 103 97 27 103 94 89 101 11 117 116 117 16 9 14 105 59 103 102 1 103 103 23 19 11 4 4 1 1 1 42 42 39 3 3 2 3 3 44 44 7 35 23 15 40 32 32 25 7 37 13 38 28 17 18 21 39 34 34 3 31 34 33 34 34 17 5 3 4 15 2 8 11 6 1 9 4 5 9 2 2 7 2 6 1 4 2 5 3 32 32 31 1 10 2 19 1 30 42 42 42 7 42 41 42 16 2 9 1 42 42 42 42 42 42 1 2 2 2 2 3 1 2 2 4 1 3 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 76 75 76 75 74 76 75 76 75 75 76 74 76 76 75 45 35 76 76 76 76 76 75 75 46 67 30 59 45 64 66 73 56 73 75 75 56 57 1 1 1 12 7 5 2 92 6 91 1 1 41 30 13 38 5 43 43 43 42 42 43 43 43 34 34 33 34 34 34 1 1 43 43 1 41 29 14 43 43 43 43 43 43 2 41 2 41 43 43 42 42 42 42 1 33 33 130 130 126 130 130 83 83 22 41 42 42 42 42 42 42 12 34 34 34 34 42 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 // SPDX-License-Identifier: GPL-2.0-only /* * The input core * * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_BASENAME ": " fmt #include <linux/export.h> #include <linux/init.h> #include <linux/types.h> #include <linux/idr.h> #include <linux/input/mt.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/major.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/pm.h> #include <linux/poll.h> #include <linux/device.h> #include <linux/kstrtox.h> #include <linux/mutex.h> #include <linux/rcupdate.h> #include "input-compat.h" #include "input-core-private.h" #include "input-poller.h" MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION("Input core"); MODULE_LICENSE("GPL"); #define INPUT_MAX_CHAR_DEVICES 1024 #define INPUT_FIRST_DYNAMIC_DEV 256 static DEFINE_IDA(input_ida); static LIST_HEAD(input_dev_list); static LIST_HEAD(input_handler_list); /* * input_mutex protects access to both input_dev_list and input_handler_list. * This also causes input_[un]register_device and input_[un]register_handler * be mutually exclusive which simplifies locking in drivers implementing * input handlers. */ static DEFINE_MUTEX(input_mutex); static const struct input_value input_value_sync = { EV_SYN, SYN_REPORT, 1 }; static const unsigned int input_max_code[EV_CNT] = { [EV_KEY] = KEY_MAX, [EV_REL] = REL_MAX, [EV_ABS] = ABS_MAX, [EV_MSC] = MSC_MAX, [EV_SW] = SW_MAX, [EV_LED] = LED_MAX, [EV_SND] = SND_MAX, [EV_FF] = FF_MAX, }; static inline int is_event_supported(unsigned int code, unsigned long *bm, unsigned int max) { return code <= max && test_bit(code, bm); } static int input_defuzz_abs_event(int value, int old_val, int fuzz) { if (fuzz) { if (value > old_val - fuzz / 2 && value < old_val + fuzz / 2) return old_val; if (value > old_val - fuzz && value < old_val + fuzz) return (old_val * 3 + value) / 4; if (value > old_val - fuzz * 2 && value < old_val + fuzz * 2) return (old_val + value) / 2; } return value; } static void input_start_autorepeat(struct input_dev *dev, int code) { if (test_bit(EV_REP, dev->evbit) && dev->rep[REP_PERIOD] && dev->rep[REP_DELAY] && dev->timer.function) { dev->repeat_key = code; mod_timer(&dev->timer, jiffies + msecs_to_jiffies(dev->rep[REP_DELAY])); } } static void input_stop_autorepeat(struct input_dev *dev) { timer_delete(&dev->timer); } /* * Pass values first through all filters and then, if event has not been * filtered out, through all open handles. This order is achieved by placing * filters at the head of the list of handles attached to the device, and * placing regular handles at the tail of the list. * * This function is called with dev->event_lock held and interrupts disabled. */ static void input_pass_values(struct input_dev *dev, struct input_value *vals, unsigned int count) { struct input_handle *handle; struct input_value *v; lockdep_assert_held(&dev->event_lock); scoped_guard(rcu) { handle = rcu_dereference(dev->grab); if (handle) { count = handle->handle_events(handle, vals, count); break; } list_for_each_entry_rcu(handle, &dev->h_list, d_node) { if (handle->open) { count = handle->handle_events(handle, vals, count); if (!count) break; } } } /* trigger auto repeat for key events */ if (test_bit(EV_REP, dev->evbit) && test_bit(EV_KEY, dev->evbit)) { for (v = vals; v != vals + count; v++) { if (v->type == EV_KEY && v->value != 2) { if (v->value) input_start_autorepeat(dev, v->code); else input_stop_autorepeat(dev); } } } } #define INPUT_IGNORE_EVENT 0 #define INPUT_PASS_TO_HANDLERS 1 #define INPUT_PASS_TO_DEVICE 2 #define INPUT_SLOT 4 #define INPUT_FLUSH 8 #define INPUT_PASS_TO_ALL (INPUT_PASS_TO_HANDLERS | INPUT_PASS_TO_DEVICE) static int input_handle_abs_event(struct input_dev *dev, unsigned int code, int *pval) { struct input_mt *mt = dev->mt; bool is_new_slot = false; bool is_mt_event; int *pold; if (code == ABS_MT_SLOT) { /* * "Stage" the event; we'll flush it later, when we * get actual touch data. */ if (mt && *pval >= 0 && *pval < mt->num_slots) mt->slot = *pval; return INPUT_IGNORE_EVENT; } is_mt_event = input_is_mt_value(code); if (!is_mt_event) { pold = &dev->absinfo[code].value; } else if (mt) { pold = &mt->slots[mt->slot].abs[code - ABS_MT_FIRST]; is_new_slot = mt->slot != dev->absinfo[ABS_MT_SLOT].value; } else { /* * Bypass filtering for multi-touch events when * not employing slots. */ pold = NULL; } if (pold) { *pval = input_defuzz_abs_event(*pval, *pold, dev->absinfo[code].fuzz); if (*pold == *pval) return INPUT_IGNORE_EVENT; *pold = *pval; } /* Flush pending "slot" event */ if (is_new_slot) { dev->absinfo[ABS_MT_SLOT].value = mt->slot; return INPUT_PASS_TO_HANDLERS | INPUT_SLOT; } return INPUT_PASS_TO_HANDLERS; } static int input_get_disposition(struct input_dev *dev, unsigned int type, unsigned int code, int *pval) { int disposition = INPUT_IGNORE_EVENT; int value = *pval; /* filter-out events from inhibited devices */ if (dev->inhibited) return INPUT_IGNORE_EVENT; switch (type) { case EV_SYN: switch (code) { case SYN_CONFIG: disposition = INPUT_PASS_TO_ALL; break; case SYN_REPORT: disposition = INPUT_PASS_TO_HANDLERS | INPUT_FLUSH; break; case SYN_MT_REPORT: disposition = INPUT_PASS_TO_HANDLERS; break; } break; case EV_KEY: if (is_event_supported(code, dev->keybit, KEY_MAX)) { /* auto-repeat bypasses state updates */ if (value == 2) { disposition = INPUT_PASS_TO_HANDLERS; break; } if (!!test_bit(code, dev->key) != !!value) { __change_bit(code, dev->key); disposition = INPUT_PASS_TO_HANDLERS; } } break; case EV_SW: if (is_event_supported(code, dev->swbit, SW_MAX) && !!test_bit(code, dev->sw) != !!value) { __change_bit(code, dev->sw); disposition = INPUT_PASS_TO_HANDLERS; } break; case EV_ABS: if (is_event_supported(code, dev->absbit, ABS_MAX)) disposition = input_handle_abs_event(dev, code, &value); break; case EV_REL: if (is_event_supported(code, dev->relbit, REL_MAX) && value) disposition = INPUT_PASS_TO_HANDLERS; break; case EV_MSC: if (is_event_supported(code, dev->mscbit, MSC_MAX)) disposition = INPUT_PASS_TO_ALL; break; case EV_LED: if (is_event_supported(code, dev->ledbit, LED_MAX) && !!test_bit(code, dev->led) != !!value) { __change_bit(code, dev->led); disposition = INPUT_PASS_TO_ALL; } break; case EV_SND: if (is_event_supported(code, dev->sndbit, SND_MAX)) { if (!!test_bit(code, dev->snd) != !!value) __change_bit(code, dev->snd); disposition = INPUT_PASS_TO_ALL; } break; case EV_REP: if (code <= REP_MAX && value >= 0 && dev->rep[code] != value) { dev->rep[code] = value; disposition = INPUT_PASS_TO_ALL; } break; case EV_FF: if (value >= 0) disposition = INPUT_PASS_TO_ALL; break; case EV_PWR: disposition = INPUT_PASS_TO_ALL; break; } *pval = value; return disposition; } static void input_event_dispose(struct input_dev *dev, int disposition, unsigned int type, unsigned int code, int value) { if ((disposition & INPUT_PASS_TO_DEVICE) && dev->event) dev->event(dev, type, code, value); if (disposition & INPUT_PASS_TO_HANDLERS) { struct input_value *v; if (disposition & INPUT_SLOT) { v = &dev->vals[dev->num_vals++]; v->type = EV_ABS; v->code = ABS_MT_SLOT; v->value = dev->mt->slot; } v = &dev->vals[dev->num_vals++]; v->type = type; v->code = code; v->value = value; } if (disposition & INPUT_FLUSH) { if (dev->num_vals >= 2) input_pass_values(dev, dev->vals, dev->num_vals); dev->num_vals = 0; /* * Reset the timestamp on flush so we won't end up * with a stale one. Note we only need to reset the * monolithic one as we use its presence when deciding * whether to generate a synthetic timestamp. */ dev->timestamp[INPUT_CLK_MONO] = ktime_set(0, 0); } else if (dev->num_vals >= dev->max_vals - 2) { dev->vals[dev->num_vals++] = input_value_sync; input_pass_values(dev, dev->vals, dev->num_vals); dev->num_vals = 0; } } void input_handle_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { int disposition; lockdep_assert_held(&dev->event_lock); disposition = input_get_disposition(dev, type, code, &value); if (disposition != INPUT_IGNORE_EVENT) { if (type != EV_SYN) add_input_randomness(type, code, value); input_event_dispose(dev, disposition, type, code, value); } } /** * input_event() - report new input event * @dev: device that generated the event * @type: type of the event * @code: event code * @value: value of the event * * This function should be used by drivers implementing various input * devices to report input events. See also input_inject_event(). * * NOTE: input_event() may be safely used right after input device was * allocated with input_allocate_device(), even before it is registered * with input_register_device(), but the event will not reach any of the * input handlers. Such early invocation of input_event() may be used * to 'seed' initial state of a switch or initial position of absolute * axis, etc. */ void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { if (is_event_supported(type, dev->evbit, EV_MAX)) { guard(spinlock_irqsave)(&dev->event_lock); input_handle_event(dev, type, code, value); } } EXPORT_SYMBOL(input_event); /** * input_inject_event() - send input event from input handler * @handle: input handle to send event through * @type: type of the event * @code: event code * @value: value of the event * * Similar to input_event() but will ignore event if device is * "grabbed" and handle injecting event is not the one that owns * the device. */ void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value) { struct input_dev *dev = handle->dev; struct input_handle *grab; if (is_event_supported(type, dev->evbit, EV_MAX)) { guard(spinlock_irqsave)(&dev->event_lock); guard(rcu)(); grab = rcu_dereference(dev->grab); if (!grab || grab == handle) input_handle_event(dev, type, code, value); } } EXPORT_SYMBOL(input_inject_event); /** * input_alloc_absinfo - allocates array of input_absinfo structs * @dev: the input device emitting absolute events * * If the absinfo struct the caller asked for is already allocated, this * functions will not do anything. */ void input_alloc_absinfo(struct input_dev *dev) { if (dev->absinfo) return; dev->absinfo = kcalloc(ABS_CNT, sizeof(*dev->absinfo), GFP_KERNEL); if (!dev->absinfo) { dev_err(dev->dev.parent ?: &dev->dev, "%s: unable to allocate memory\n", __func__); /* * We will handle this allocation failure in * input_register_device() when we refuse to register input * device with ABS bits but without absinfo. */ } } EXPORT_SYMBOL(input_alloc_absinfo); void input_set_abs_params(struct input_dev *dev, unsigned int axis, int min, int max, int fuzz, int flat) { struct input_absinfo *absinfo; __set_bit(EV_ABS, dev->evbit); __set_bit(axis, dev->absbit); input_alloc_absinfo(dev); if (!dev->absinfo) return; absinfo = &dev->absinfo[axis]; absinfo->minimum = min; absinfo->maximum = max; absinfo->fuzz = fuzz; absinfo->flat = flat; } EXPORT_SYMBOL(input_set_abs_params); /** * input_copy_abs - Copy absinfo from one input_dev to another * @dst: Destination input device to copy the abs settings to * @dst_axis: ABS_* value selecting the destination axis * @src: Source input device to copy the abs settings from * @src_axis: ABS_* value selecting the source axis * * Set absinfo for the selected destination axis by copying it from * the specified source input device's source axis. * This is useful to e.g. setup a pen/stylus input-device for combined * touchscreen/pen hardware where the pen uses the same coordinates as * the touchscreen. */ void input_copy_abs(struct input_dev *dst, unsigned int dst_axis, const struct input_dev *src, unsigned int src_axis) { /* src must have EV_ABS and src_axis set */ if (WARN_ON(!(test_bit(EV_ABS, src->evbit) && test_bit(src_axis, src->absbit)))) return; /* * input_alloc_absinfo() may have failed for the source. Our caller is * expected to catch this when registering the input devices, which may * happen after the input_copy_abs() call. */ if (!src->absinfo) return; input_set_capability(dst, EV_ABS, dst_axis); if (!dst->absinfo) return; dst->absinfo[dst_axis] = src->absinfo[src_axis]; } EXPORT_SYMBOL(input_copy_abs); /** * input_grab_device - grabs device for exclusive use * @handle: input handle that wants to own the device * * When a device is grabbed by an input handle all events generated by * the device are delivered only to this handle. Also events injected * by other input handles are ignored while device is grabbed. */ int input_grab_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->grab) return -EBUSY; rcu_assign_pointer(dev->grab, handle); } return 0; } EXPORT_SYMBOL(input_grab_device); static void __input_release_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; struct input_handle *grabber; grabber = rcu_dereference_protected(dev->grab, lockdep_is_held(&dev->mutex)); if (grabber == handle) { rcu_assign_pointer(dev->grab, NULL); /* Make sure input_pass_values() notices that grab is gone */ synchronize_rcu(); list_for_each_entry(handle, &dev->h_list, d_node) if (handle->open && handle->handler->start) handle->handler->start(handle); } } /** * input_release_device - release previously grabbed device * @handle: input handle that owns the device * * Releases previously grabbed device so that other input handles can * start receiving input events. Upon release all handlers attached * to the device have their start() method called so they have a change * to synchronize device state with the rest of the system. */ void input_release_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; guard(mutex)(&dev->mutex); __input_release_device(handle); } EXPORT_SYMBOL(input_release_device); /** * input_open_device - open input device * @handle: handle through which device is being accessed * * This function should be called by input handlers when they * want to start receive events from given input device. */ int input_open_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; int error; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->going_away) return -ENODEV; handle->open++; if (handle->handler->passive_observer) return 0; if (dev->users++ || dev->inhibited) { /* * Device is already opened and/or inhibited, * so we can exit immediately and report success. */ return 0; } if (dev->open) { error = dev->open(dev); if (error) { dev->users--; handle->open--; /* * Make sure we are not delivering any more * events through this handle. */ synchronize_rcu(); return error; } } if (dev->poller) input_dev_poller_start(dev->poller); } return 0; } EXPORT_SYMBOL(input_open_device); int input_flush_device(struct input_handle *handle, struct file *file) { struct input_dev *dev = handle->dev; scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { if (dev->flush) return dev->flush(dev, file); } return 0; } EXPORT_SYMBOL(input_flush_device); /** * input_close_device - close input device * @handle: handle through which device is being accessed * * This function should be called by input handlers when they * want to stop receive events from given input device. */ void input_close_device(struct input_handle *handle) { struct input_dev *dev = handle->dev; guard(mutex)(&dev->mutex); __input_release_device(handle); if (!handle->handler->passive_observer) { if (!--dev->users && !dev->inhibited) { if (dev->poller) input_dev_poller_stop(dev->poller); if (dev->close) dev->close(dev); } } if (!--handle->open) { /* * synchronize_rcu() makes sure that input_pass_values() * completed and that no more input events are delivered * through this handle */ synchronize_rcu(); } } EXPORT_SYMBOL(input_close_device); /* * Simulate keyup events for all keys that are marked as pressed. * The function must be called with dev->event_lock held. */ static bool input_dev_release_keys(struct input_dev *dev) { bool need_sync = false; int code; lockdep_assert_held(&dev->event_lock); if (is_event_supported(EV_KEY, dev->evbit, EV_MAX)) { for_each_set_bit(code, dev->key, KEY_CNT) { input_handle_event(dev, EV_KEY, code, 0); need_sync = true; } } return need_sync; } /* * Prepare device for unregistering */ static void input_disconnect_device(struct input_dev *dev) { struct input_handle *handle; /* * Mark device as going away. Note that we take dev->mutex here * not to protect access to dev->going_away but rather to ensure * that there are no threads in the middle of input_open_device() */ scoped_guard(mutex, &dev->mutex) dev->going_away = true; guard(spinlock_irq)(&dev->event_lock); /* * Simulate keyup events for all pressed keys so that handlers * are not left with "stuck" keys. The driver may continue * generate events even after we done here but they will not * reach any handlers. */ if (input_dev_release_keys(dev)) input_handle_event(dev, EV_SYN, SYN_REPORT, 1); list_for_each_entry(handle, &dev->h_list, d_node) handle->open = 0; } /** * input_scancode_to_scalar() - converts scancode in &struct input_keymap_entry * @ke: keymap entry containing scancode to be converted. * @scancode: pointer to the location where converted scancode should * be stored. * * This function is used to convert scancode stored in &struct keymap_entry * into scalar form understood by legacy keymap handling methods. These * methods expect scancodes to be represented as 'unsigned int'. */ int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode) { switch (ke->len) { case 1: *scancode = *((u8 *)ke->scancode); break; case 2: *scancode = *((u16 *)ke->scancode); break; case 4: *scancode = *((u32 *)ke->scancode); break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL(input_scancode_to_scalar); /* * Those routines handle the default case where no [gs]etkeycode() is * defined. In this case, an array indexed by the scancode is used. */ static unsigned int input_fetch_keycode(struct input_dev *dev, unsigned int index) { switch (dev->keycodesize) { case 1: return ((u8 *)dev->keycode)[index]; case 2: return ((u16 *)dev->keycode)[index]; default: return ((u32 *)dev->keycode)[index]; } } static int input_default_getkeycode(struct input_dev *dev, struct input_keymap_entry *ke) { unsigned int index; int error; if (!dev->keycodesize) return -EINVAL; if (ke->flags & INPUT_KEYMAP_BY_INDEX) index = ke->index; else { error = input_scancode_to_scalar(ke, &index); if (error) return error; } if (index >= dev->keycodemax) return -EINVAL; ke->keycode = input_fetch_keycode(dev, index); ke->index = index; ke->len = sizeof(index); memcpy(ke->scancode, &index, sizeof(index)); return 0; } static int input_default_setkeycode(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode) { unsigned int index; int error; int i; if (!dev->keycodesize) return -EINVAL; if (ke->flags & INPUT_KEYMAP_BY_INDEX) { index = ke->index; } else { error = input_scancode_to_scalar(ke, &index); if (error) return error; } if (index >= dev->keycodemax) return -EINVAL; if (dev->keycodesize < sizeof(ke->keycode) && (ke->keycode >> (dev->keycodesize * 8))) return -EINVAL; switch (dev->keycodesize) { case 1: { u8 *k = (u8 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } case 2: { u16 *k = (u16 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } default: { u32 *k = (u32 *)dev->keycode; *old_keycode = k[index]; k[index] = ke->keycode; break; } } if (*old_keycode <= KEY_MAX) { __clear_bit(*old_keycode, dev->keybit); for (i = 0; i < dev->keycodemax; i++) { if (input_fetch_keycode(dev, i) == *old_keycode) { __set_bit(*old_keycode, dev->keybit); /* Setting the bit twice is useless, so break */ break; } } } __set_bit(ke->keycode, dev->keybit); return 0; } /** * input_get_keycode - retrieve keycode currently mapped to a given scancode * @dev: input device which keymap is being queried * @ke: keymap entry * * This function should be called by anyone interested in retrieving current * keymap. Presently evdev handlers use it. */ int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke) { guard(spinlock_irqsave)(&dev->event_lock); return dev->getkeycode(dev, ke); } EXPORT_SYMBOL(input_get_keycode); /** * input_set_keycode - attribute a keycode to a given scancode * @dev: input device which keymap is being updated * @ke: new keymap entry * * This function should be called by anyone needing to update current * keymap. Presently keyboard and evdev handlers use it. */ int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke) { unsigned int old_keycode; int error; if (ke->keycode > KEY_MAX) return -EINVAL; guard(spinlock_irqsave)(&dev->event_lock); error = dev->setkeycode(dev, ke, &old_keycode); if (error) return error; /* Make sure KEY_RESERVED did not get enabled. */ __clear_bit(KEY_RESERVED, dev->keybit); /* * Simulate keyup event if keycode is not present * in the keymap anymore */ if (old_keycode > KEY_MAX) { dev_warn(dev->dev.parent ?: &dev->dev, "%s: got too big old keycode %#x\n", __func__, old_keycode); } else if (test_bit(EV_KEY, dev->evbit) && !is_event_supported(old_keycode, dev->keybit, KEY_MAX) && __test_and_clear_bit(old_keycode, dev->key)) { /* * We have to use input_event_dispose() here directly instead * of input_handle_event() because the key we want to release * here is considered no longer supported by the device and * input_handle_event() will ignore it. */ input_event_dispose(dev, INPUT_PASS_TO_HANDLERS, EV_KEY, old_keycode, 0); input_event_dispose(dev, INPUT_PASS_TO_HANDLERS | INPUT_FLUSH, EV_SYN, SYN_REPORT, 1); } return 0; } EXPORT_SYMBOL(input_set_keycode); bool input_match_device_id(const struct input_dev *dev, const struct input_device_id *id) { if (id->flags & INPUT_DEVICE_ID_MATCH_BUS) if (id->bustype != dev->id.bustype) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_VENDOR) if (id->vendor != dev->id.vendor) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT) if (id->product != dev->id.product) return false; if (id->flags & INPUT_DEVICE_ID_MATCH_VERSION) if (id->version != dev->id.version) return false; if (!bitmap_subset(id->evbit, dev->evbit, EV_MAX) || !bitmap_subset(id->keybit, dev->keybit, KEY_MAX) || !bitmap_subset(id->relbit, dev->relbit, REL_MAX) || !bitmap_subset(id->absbit, dev->absbit, ABS_MAX) || !bitmap_subset(id->mscbit, dev->mscbit, MSC_MAX) || !bitmap_subset(id->ledbit, dev->ledbit, LED_MAX) || !bitmap_subset(id->sndbit, dev->sndbit, SND_MAX) || !bitmap_subset(id->ffbit, dev->ffbit, FF_MAX) || !bitmap_subset(id->swbit, dev->swbit, SW_MAX) || !bitmap_subset(id->propbit, dev->propbit, INPUT_PROP_MAX)) { return false; } return true; } EXPORT_SYMBOL(input_match_device_id); static const struct input_device_id *input_match_device(struct input_handler *handler, struct input_dev *dev) { const struct input_device_id *id; for (id = handler->id_table; id->flags; id++) { if (input_match_device_id(dev, id) && (!handler->match || handler->match(handler, dev))) { return id; } } return NULL; } static int input_attach_handler(struct input_dev *dev, struct input_handler *handler) { const struct input_device_id *id; int error; id = input_match_device(handler, dev); if (!id) return -ENODEV; error = handler->connect(handler, dev, id); if (error && error != -ENODEV) pr_err("failed to attach handler %s to device %s, error: %d\n", handler->name, kobject_name(&dev->dev.kobj), error); return error; } #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_bus_input_dir; static DECLARE_WAIT_QUEUE_HEAD(input_devices_poll_wait); static int input_devices_state; static inline void input_wakeup_procfs_readers(void) { input_devices_state++; wake_up(&input_devices_poll_wait); } struct input_seq_state { unsigned short pos; bool mutex_acquired; int input_devices_state; }; static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait) { struct seq_file *seq = file->private_data; struct input_seq_state *state = seq->private; poll_wait(file, &input_devices_poll_wait, wait); if (state->input_devices_state != input_devices_state) { state->input_devices_state = input_devices_state; return EPOLLIN | EPOLLRDNORM; } return 0; } static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos) { struct input_seq_state *state = seq->private; int error; error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; return ERR_PTR(error); } state->mutex_acquired = true; return seq_list_start(&input_dev_list, *pos); } static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &input_dev_list, pos); } static void input_seq_stop(struct seq_file *seq, void *v) { struct input_seq_state *state = seq->private; if (state->mutex_acquired) mutex_unlock(&input_mutex); } static void input_seq_print_bitmap(struct seq_file *seq, const char *name, unsigned long *bitmap, int max) { int i; bool skip_empty = true; char buf[18]; seq_printf(seq, "B: %s=", name); for (i = BITS_TO_LONGS(max) - 1; i >= 0; i--) { if (input_bits_to_string(buf, sizeof(buf), bitmap[i], skip_empty)) { skip_empty = false; seq_printf(seq, "%s%s", buf, i > 0 ? " " : ""); } } /* * If no output was produced print a single 0. */ if (skip_empty) seq_putc(seq, '0'); seq_putc(seq, '\n'); } static int input_devices_seq_show(struct seq_file *seq, void *v) { struct input_dev *dev = container_of(v, struct input_dev, node); const char *path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); struct input_handle *handle; seq_printf(seq, "I: Bus=%04x Vendor=%04x Product=%04x Version=%04x\n", dev->id.bustype, dev->id.vendor, dev->id.product, dev->id.version); seq_printf(seq, "N: Name=\"%s\"\n", dev->name ? dev->name : ""); seq_printf(seq, "P: Phys=%s\n", dev->phys ? dev->phys : ""); seq_printf(seq, "S: Sysfs=%s\n", path ? path : ""); seq_printf(seq, "U: Uniq=%s\n", dev->uniq ? dev->uniq : ""); seq_puts(seq, "H: Handlers="); list_for_each_entry(handle, &dev->h_list, d_node) seq_printf(seq, "%s ", handle->name); seq_putc(seq, '\n'); input_seq_print_bitmap(seq, "PROP", dev->propbit, INPUT_PROP_MAX); input_seq_print_bitmap(seq, "EV", dev->evbit, EV_MAX); if (test_bit(EV_KEY, dev->evbit)) input_seq_print_bitmap(seq, "KEY", dev->keybit, KEY_MAX); if (test_bit(EV_REL, dev->evbit)) input_seq_print_bitmap(seq, "REL", dev->relbit, REL_MAX); if (test_bit(EV_ABS, dev->evbit)) input_seq_print_bitmap(seq, "ABS", dev->absbit, ABS_MAX); if (test_bit(EV_MSC, dev->evbit)) input_seq_print_bitmap(seq, "MSC", dev->mscbit, MSC_MAX); if (test_bit(EV_LED, dev->evbit)) input_seq_print_bitmap(seq, "LED", dev->ledbit, LED_MAX); if (test_bit(EV_SND, dev->evbit)) input_seq_print_bitmap(seq, "SND", dev->sndbit, SND_MAX); if (test_bit(EV_FF, dev->evbit)) input_seq_print_bitmap(seq, "FF", dev->ffbit, FF_MAX); if (test_bit(EV_SW, dev->evbit)) input_seq_print_bitmap(seq, "SW", dev->swbit, SW_MAX); seq_putc(seq, '\n'); kfree(path); return 0; } static const struct seq_operations input_devices_seq_ops = { .start = input_devices_seq_start, .next = input_devices_seq_next, .stop = input_seq_stop, .show = input_devices_seq_show, }; static int input_proc_devices_open(struct inode *inode, struct file *file) { return seq_open_private(file, &input_devices_seq_ops, sizeof(struct input_seq_state)); } static const struct proc_ops input_devices_proc_ops = { .proc_open = input_proc_devices_open, .proc_poll = input_proc_devices_poll, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) { struct input_seq_state *state = seq->private; int error; error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; return ERR_PTR(error); } state->mutex_acquired = true; state->pos = *pos; return seq_list_start(&input_handler_list, *pos); } static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct input_seq_state *state = seq->private; state->pos = *pos + 1; return seq_list_next(v, &input_handler_list, pos); } static int input_handlers_seq_show(struct seq_file *seq, void *v) { struct input_handler *handler = container_of(v, struct input_handler, node); struct input_seq_state *state = seq->private; seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); if (handler->filter) seq_puts(seq, " (filter)"); if (handler->legacy_minors) seq_printf(seq, " Minor=%d", handler->minor); seq_putc(seq, '\n'); return 0; } static const struct seq_operations input_handlers_seq_ops = { .start = input_handlers_seq_start, .next = input_handlers_seq_next, .stop = input_seq_stop, .show = input_handlers_seq_show, }; static int input_proc_handlers_open(struct inode *inode, struct file *file) { return seq_open_private(file, &input_handlers_seq_ops, sizeof(struct input_seq_state)); } static const struct proc_ops input_handlers_proc_ops = { .proc_open = input_proc_handlers_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = seq_release_private, }; static int __init input_proc_init(void) { struct proc_dir_entry *entry; proc_bus_input_dir = proc_mkdir("bus/input", NULL); if (!proc_bus_input_dir) return -ENOMEM; entry = proc_create("devices", 0, proc_bus_input_dir, &input_devices_proc_ops); if (!entry) goto fail1; entry = proc_create("handlers", 0, proc_bus_input_dir, &input_handlers_proc_ops); if (!entry) goto fail2; return 0; fail2: remove_proc_entry("devices", proc_bus_input_dir); fail1: remove_proc_entry("bus/input", NULL); return -ENOMEM; } static void input_proc_exit(void) { remove_proc_entry("devices", proc_bus_input_dir); remove_proc_entry("handlers", proc_bus_input_dir); remove_proc_entry("bus/input", NULL); } #else /* !CONFIG_PROC_FS */ static inline void input_wakeup_procfs_readers(void) { } static inline int input_proc_init(void) { return 0; } static inline void input_proc_exit(void) { } #endif #define INPUT_DEV_STRING_ATTR_SHOW(name) \ static ssize_t input_dev_show_##name(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ \ return sysfs_emit(buf, "%s\n", \ input_dev->name ? input_dev->name : ""); \ } \ static DEVICE_ATTR(name, S_IRUGO, input_dev_show_##name, NULL) INPUT_DEV_STRING_ATTR_SHOW(name); INPUT_DEV_STRING_ATTR_SHOW(phys); INPUT_DEV_STRING_ATTR_SHOW(uniq); static int input_print_modalias_bits(char *buf, int size, char name, const unsigned long *bm, unsigned int min_bit, unsigned int max_bit) { int bit = min_bit; int len = 0; len += snprintf(buf, max(size, 0), "%c", name); for_each_set_bit_from(bit, bm, max_bit) len += snprintf(buf + len, max(size - len, 0), "%X,", bit); return len; } static int input_print_modalias_parts(char *buf, int size, int full_len, const struct input_dev *id) { int len, klen, remainder, space; len = snprintf(buf, max(size, 0), "input:b%04Xv%04Xp%04Xe%04X-", id->id.bustype, id->id.vendor, id->id.product, id->id.version); len += input_print_modalias_bits(buf + len, size - len, 'e', id->evbit, 0, EV_MAX); /* * Calculate the remaining space in the buffer making sure we * have place for the terminating 0. */ space = max(size - (len + 1), 0); klen = input_print_modalias_bits(buf + len, size - len, 'k', id->keybit, KEY_MIN_INTERESTING, KEY_MAX); len += klen; /* * If we have more data than we can fit in the buffer, check * if we can trim key data to fit in the rest. We will indicate * that key data is incomplete by adding "+" sign at the end, like * this: * "k1,2,3,45,+,". * * Note that we shortest key info (if present) is "k+," so we * can only try to trim if key data is longer than that. */ if (full_len && size < full_len + 1 && klen > 3) { remainder = full_len - len; /* * We can only trim if we have space for the remainder * and also for at least "k+," which is 3 more characters. */ if (remainder <= space - 3) { /* * We are guaranteed to have 'k' in the buffer, so * we need at least 3 additional bytes for storing * "+," in addition to the remainder. */ for (int i = size - 1 - remainder - 3; i >= 0; i--) { if (buf[i] == 'k' || buf[i] == ',') { strcpy(buf + i + 1, "+,"); len = i + 3; /* Not counting '\0' */ break; } } } } len += input_print_modalias_bits(buf + len, size - len, 'r', id->relbit, 0, REL_MAX); len += input_print_modalias_bits(buf + len, size - len, 'a', id->absbit, 0, ABS_MAX); len += input_print_modalias_bits(buf + len, size - len, 'm', id->mscbit, 0, MSC_MAX); len += input_print_modalias_bits(buf + len, size - len, 'l', id->ledbit, 0, LED_MAX); len += input_print_modalias_bits(buf + len, size - len, 's', id->sndbit, 0, SND_MAX); len += input_print_modalias_bits(buf + len, size - len, 'f', id->ffbit, 0, FF_MAX); len += input_print_modalias_bits(buf + len, size - len, 'w', id->swbit, 0, SW_MAX); return len; } static int input_print_modalias(char *buf, int size, const struct input_dev *id) { int full_len; /* * Printing is done in 2 passes: first one figures out total length * needed for the modalias string, second one will try to trim key * data in case when buffer is too small for the entire modalias. * If the buffer is too small regardless, it will fill as much as it * can (without trimming key data) into the buffer and leave it to * the caller to figure out what to do with the result. */ full_len = input_print_modalias_parts(NULL, 0, 0, id); return input_print_modalias_parts(buf, size, full_len, id); } static ssize_t input_dev_show_modalias(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *id = to_input_dev(dev); ssize_t len; len = input_print_modalias(buf, PAGE_SIZE, id); if (len < PAGE_SIZE - 2) len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return min_t(int, len, PAGE_SIZE); } static DEVICE_ATTR(modalias, S_IRUGO, input_dev_show_modalias, NULL); static int input_print_bitmap(char *buf, int buf_size, const unsigned long *bitmap, int max, int add_cr); static ssize_t input_dev_show_properties(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input_dev = to_input_dev(dev); int len = input_print_bitmap(buf, PAGE_SIZE, input_dev->propbit, INPUT_PROP_MAX, true); return min_t(int, len, PAGE_SIZE); } static DEVICE_ATTR(properties, S_IRUGO, input_dev_show_properties, NULL); static int input_inhibit_device(struct input_dev *dev); static int input_uninhibit_device(struct input_dev *dev); static ssize_t inhibited_show(struct device *dev, struct device_attribute *attr, char *buf) { struct input_dev *input_dev = to_input_dev(dev); return sysfs_emit(buf, "%d\n", input_dev->inhibited); } static ssize_t inhibited_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct input_dev *input_dev = to_input_dev(dev); ssize_t rv; bool inhibited; if (kstrtobool(buf, &inhibited)) return -EINVAL; if (inhibited) rv = input_inhibit_device(input_dev); else rv = input_uninhibit_device(input_dev); if (rv != 0) return rv; return len; } static DEVICE_ATTR_RW(inhibited); static struct attribute *input_dev_attrs[] = { &dev_attr_name.attr, &dev_attr_phys.attr, &dev_attr_uniq.attr, &dev_attr_modalias.attr, &dev_attr_properties.attr, &dev_attr_inhibited.attr, NULL }; static const struct attribute_group input_dev_attr_group = { .attrs = input_dev_attrs, }; #define INPUT_DEV_ID_ATTR(name) \ static ssize_t input_dev_show_id_##name(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ return sysfs_emit(buf, "%04x\n", input_dev->id.name); \ } \ static DEVICE_ATTR(name, S_IRUGO, input_dev_show_id_##name, NULL) INPUT_DEV_ID_ATTR(bustype); INPUT_DEV_ID_ATTR(vendor); INPUT_DEV_ID_ATTR(product); INPUT_DEV_ID_ATTR(version); static struct attribute *input_dev_id_attrs[] = { &dev_attr_bustype.attr, &dev_attr_vendor.attr, &dev_attr_product.attr, &dev_attr_version.attr, NULL }; static const struct attribute_group input_dev_id_attr_group = { .name = "id", .attrs = input_dev_id_attrs, }; static int input_print_bitmap(char *buf, int buf_size, const unsigned long *bitmap, int max, int add_cr) { int i; int len = 0; bool skip_empty = true; for (i = BITS_TO_LONGS(max) - 1; i >= 0; i--) { len += input_bits_to_string(buf + len, max(buf_size - len, 0), bitmap[i], skip_empty); if (len) { skip_empty = false; if (i > 0) len += snprintf(buf + len, max(buf_size - len, 0), " "); } } /* * If no output was produced print a single 0. */ if (len == 0) len = snprintf(buf, buf_size, "%d", 0); if (add_cr) len += snprintf(buf + len, max(buf_size - len, 0), "\n"); return len; } #define INPUT_DEV_CAP_ATTR(ev, bm) \ static ssize_t input_dev_show_cap_##bm(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct input_dev *input_dev = to_input_dev(dev); \ int len = input_print_bitmap(buf, PAGE_SIZE, \ input_dev->bm##bit, ev##_MAX, \ true); \ return min_t(int, len, PAGE_SIZE); \ } \ static DEVICE_ATTR(bm, S_IRUGO, input_dev_show_cap_##bm, NULL) INPUT_DEV_CAP_ATTR(EV, ev); INPUT_DEV_CAP_ATTR(KEY, key); INPUT_DEV_CAP_ATTR(REL, rel); INPUT_DEV_CAP_ATTR(ABS, abs); INPUT_DEV_CAP_ATTR(MSC, msc); INPUT_DEV_CAP_ATTR(LED, led); INPUT_DEV_CAP_ATTR(SND, snd); INPUT_DEV_CAP_ATTR(FF, ff); INPUT_DEV_CAP_ATTR(SW, sw); static struct attribute *input_dev_caps_attrs[] = { &dev_attr_ev.attr, &dev_attr_key.attr, &dev_attr_rel.attr, &dev_attr_abs.attr, &dev_attr_msc.attr, &dev_attr_led.attr, &dev_attr_snd.attr, &dev_attr_ff.attr, &dev_attr_sw.attr, NULL }; static const struct attribute_group input_dev_caps_attr_group = { .name = "capabilities", .attrs = input_dev_caps_attrs, }; static const struct attribute_group *input_dev_attr_groups[] = { &input_dev_attr_group, &input_dev_id_attr_group, &input_dev_caps_attr_group, &input_poller_attribute_group, NULL }; static void input_dev_release(struct device *device) { struct input_dev *dev = to_input_dev(device); input_ff_destroy(dev); input_mt_destroy_slots(dev); kfree(dev->poller); kfree(dev->absinfo); kfree(dev->vals); kfree(dev); module_put(THIS_MODULE); } /* * Input uevent interface - loading event handlers based on * device bitfields. */ static int input_add_uevent_bm_var(struct kobj_uevent_env *env, const char *name, const unsigned long *bitmap, int max) { int len; if (add_uevent_var(env, "%s", name)) return -ENOMEM; len = input_print_bitmap(&env->buf[env->buflen - 1], sizeof(env->buf) - env->buflen, bitmap, max, false); if (len >= (sizeof(env->buf) - env->buflen)) return -ENOMEM; env->buflen += len; return 0; } /* * This is a pretty gross hack. When building uevent data the driver core * may try adding more environment variables to kobj_uevent_env without * telling us, so we have no idea how much of the buffer we can use to * avoid overflows/-ENOMEM elsewhere. To work around this let's artificially * reduce amount of memory we will use for the modalias environment variable. * * The potential additions are: * * SEQNUM=18446744073709551615 - (%llu - 28 bytes) * HOME=/ (6 bytes) * PATH=/sbin:/bin:/usr/sbin:/usr/bin (34 bytes) * * 68 bytes total. Allow extra buffer - 96 bytes */ #define UEVENT_ENV_EXTRA_LEN 96 static int input_add_uevent_modalias_var(struct kobj_uevent_env *env, const struct input_dev *dev) { int len; if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; len = input_print_modalias(&env->buf[env->buflen - 1], (int)sizeof(env->buf) - env->buflen - UEVENT_ENV_EXTRA_LEN, dev); if (len >= ((int)sizeof(env->buf) - env->buflen - UEVENT_ENV_EXTRA_LEN)) return -ENOMEM; env->buflen += len; return 0; } #define INPUT_ADD_HOTPLUG_VAR(fmt, val...) \ do { \ int err = add_uevent_var(env, fmt, val); \ if (err) \ return err; \ } while (0) #define INPUT_ADD_HOTPLUG_BM_VAR(name, bm, max) \ do { \ int err = input_add_uevent_bm_var(env, name, bm, max); \ if (err) \ return err; \ } while (0) #define INPUT_ADD_HOTPLUG_MODALIAS_VAR(dev) \ do { \ int err = input_add_uevent_modalias_var(env, dev); \ if (err) \ return err; \ } while (0) static int input_dev_uevent(const struct device *device, struct kobj_uevent_env *env) { const struct input_dev *dev = to_input_dev(device); INPUT_ADD_HOTPLUG_VAR("PRODUCT=%x/%x/%x/%x", dev->id.bustype, dev->id.vendor, dev->id.product, dev->id.version); if (dev->name) INPUT_ADD_HOTPLUG_VAR("NAME=\"%s\"", dev->name); if (dev->phys) INPUT_ADD_HOTPLUG_VAR("PHYS=\"%s\"", dev->phys); if (dev->uniq) INPUT_ADD_HOTPLUG_VAR("UNIQ=\"%s\"", dev->uniq); INPUT_ADD_HOTPLUG_BM_VAR("PROP=", dev->propbit, INPUT_PROP_MAX); INPUT_ADD_HOTPLUG_BM_VAR("EV=", dev->evbit, EV_MAX); if (test_bit(EV_KEY, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("KEY=", dev->keybit, KEY_MAX); if (test_bit(EV_REL, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("REL=", dev->relbit, REL_MAX); if (test_bit(EV_ABS, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("ABS=", dev->absbit, ABS_MAX); if (test_bit(EV_MSC, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("MSC=", dev->mscbit, MSC_MAX); if (test_bit(EV_LED, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("LED=", dev->ledbit, LED_MAX); if (test_bit(EV_SND, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("SND=", dev->sndbit, SND_MAX); if (test_bit(EV_FF, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("FF=", dev->ffbit, FF_MAX); if (test_bit(EV_SW, dev->evbit)) INPUT_ADD_HOTPLUG_BM_VAR("SW=", dev->swbit, SW_MAX); INPUT_ADD_HOTPLUG_MODALIAS_VAR(dev); return 0; } #define INPUT_DO_TOGGLE(dev, type, bits, on) \ do { \ int i; \ bool active; \ \ if (!test_bit(EV_##type, dev->evbit)) \ break; \ \ for_each_set_bit(i, dev->bits##bit, type##_CNT) { \ active = test_bit(i, dev->bits); \ if (!active && !on) \ continue; \ \ dev->event(dev, EV_##type, i, on ? active : 0); \ } \ } while (0) static void input_dev_toggle(struct input_dev *dev, bool activate) { if (!dev->event) return; INPUT_DO_TOGGLE(dev, LED, led, activate); INPUT_DO_TOGGLE(dev, SND, snd, activate); if (activate && test_bit(EV_REP, dev->evbit)) { dev->event(dev, EV_REP, REP_PERIOD, dev->rep[REP_PERIOD]); dev->event(dev, EV_REP, REP_DELAY, dev->rep[REP_DELAY]); } } /** * input_reset_device() - reset/restore the state of input device * @dev: input device whose state needs to be reset * * This function tries to reset the state of an opened input device and * bring internal state and state if the hardware in sync with each other. * We mark all keys as released, restore LED state, repeat rate, etc. */ void input_reset_device(struct input_dev *dev) { guard(mutex)(&dev->mutex); guard(spinlock_irqsave)(&dev->event_lock); input_dev_toggle(dev, true); if (input_dev_release_keys(dev)) input_handle_event(dev, EV_SYN, SYN_REPORT, 1); } EXPORT_SYMBOL(input_reset_device); static int input_inhibit_device(struct input_dev *dev) { guard(mutex)(&dev->mutex); if (dev->inhibited) return 0; if (dev->users) { if (dev->close) dev->close(dev); if (dev->poller) input_dev_poller_stop(dev->poller); } scoped_guard(spinlock_irq, &dev->event_lock) { input_mt_release_slots(dev); input_dev_release_keys(dev); input_handle_event(dev, EV_SYN, SYN_REPORT, 1); input_dev_toggle(dev, false); } dev->inhibited = true; return 0; } static int input_uninhibit_device(struct input_dev *dev) { int error; guard(mutex)(&dev->mutex); if (!dev->inhibited) return 0; if (dev->users) { if (dev->open) { error = dev->open(dev); if (error) return error; } if (dev->poller) input_dev_poller_start(dev->poller); } dev->inhibited = false; scoped_guard(spinlock_irq, &dev->event_lock) input_dev_toggle(dev, true); return 0; } static int input_dev_suspend(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* * Keys that are pressed now are unlikely to be * still pressed when we resume. */ if (input_dev_release_keys(input_dev)) input_handle_event(input_dev, EV_SYN, SYN_REPORT, 1); /* Turn off LEDs and sounds, if any are active. */ input_dev_toggle(input_dev, false); return 0; } static int input_dev_resume(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* Restore state of LEDs and sounds, if any were active. */ input_dev_toggle(input_dev, true); return 0; } static int input_dev_freeze(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* * Keys that are pressed now are unlikely to be * still pressed when we resume. */ if (input_dev_release_keys(input_dev)) input_handle_event(input_dev, EV_SYN, SYN_REPORT, 1); return 0; } static int input_dev_poweroff(struct device *dev) { struct input_dev *input_dev = to_input_dev(dev); guard(spinlock_irq)(&input_dev->event_lock); /* Turn off LEDs and sounds, if any are active. */ input_dev_toggle(input_dev, false); return 0; } static const struct dev_pm_ops input_dev_pm_ops = { .suspend = input_dev_suspend, .resume = input_dev_resume, .freeze = input_dev_freeze, .poweroff = input_dev_poweroff, .restore = input_dev_resume, }; static const struct device_type input_dev_type = { .groups = input_dev_attr_groups, .release = input_dev_release, .uevent = input_dev_uevent, .pm = pm_sleep_ptr(&input_dev_pm_ops), }; static char *input_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "input/%s", dev_name(dev)); } const struct class input_class = { .name = "input", .devnode = input_devnode, }; EXPORT_SYMBOL_GPL(input_class); /** * input_allocate_device - allocate memory for new input device * * Returns prepared struct input_dev or %NULL. * * NOTE: Use input_free_device() to free devices that have not been * registered; input_unregister_device() should be used for already * registered devices. */ struct input_dev *input_allocate_device(void) { static atomic_t input_no = ATOMIC_INIT(-1); struct input_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; /* * Start with space for SYN_REPORT + 7 EV_KEY/EV_MSC events + 2 spare, * see input_estimate_events_per_packet(). We will tune the number * when we register the device. */ dev->max_vals = 10; dev->vals = kcalloc(dev->max_vals, sizeof(*dev->vals), GFP_KERNEL); if (!dev->vals) { kfree(dev); return NULL; } mutex_init(&dev->mutex); spin_lock_init(&dev->event_lock); timer_setup(&dev->timer, NULL, 0); INIT_LIST_HEAD(&dev->h_list); INIT_LIST_HEAD(&dev->node); dev->dev.type = &input_dev_type; dev->dev.class = &input_class; device_initialize(&dev->dev); /* * From this point on we can no longer simply "kfree(dev)", we need * to use input_free_device() so that device core properly frees its * resources associated with the input device. */ dev_set_name(&dev->dev, "input%lu", (unsigned long)atomic_inc_return(&input_no)); __module_get(THIS_MODULE); return dev; } EXPORT_SYMBOL(input_allocate_device); struct input_devres { struct input_dev *input; }; static int devm_input_device_match(struct device *dev, void *res, void *data) { struct input_devres *devres = res; return devres->input == data; } static void devm_input_device_release(struct device *dev, void *res) { struct input_devres *devres = res; struct input_dev *input = devres->input; dev_dbg(dev, "%s: dropping reference to %s\n", __func__, dev_name(&input->dev)); input_put_device(input); } /** * devm_input_allocate_device - allocate managed input device * @dev: device owning the input device being created * * Returns prepared struct input_dev or %NULL. * * Managed input devices do not need to be explicitly unregistered or * freed as it will be done automatically when owner device unbinds from * its driver (or binding fails). Once managed input device is allocated, * it is ready to be set up and registered in the same fashion as regular * input device. There are no special devm_input_device_[un]register() * variants, regular ones work with both managed and unmanaged devices, * should you need them. In most cases however, managed input device need * not be explicitly unregistered or freed. * * NOTE: the owner device is set up as parent of input device and users * should not override it. */ struct input_dev *devm_input_allocate_device(struct device *dev) { struct input_dev *input; struct input_devres *devres; devres = devres_alloc(devm_input_device_release, sizeof(*devres), GFP_KERNEL); if (!devres) return NULL; input = input_allocate_device(); if (!input) { devres_free(devres); return NULL; } input->dev.parent = dev; input->devres_managed = true; devres->input = input; devres_add(dev, devres); return input; } EXPORT_SYMBOL(devm_input_allocate_device); /** * input_free_device - free memory occupied by input_dev structure * @dev: input device to free * * This function should only be used if input_register_device() * was not called yet or if it failed. Once device was registered * use input_unregister_device() and memory will be freed once last * reference to the device is dropped. * * Device should be allocated by input_allocate_device(). * * NOTE: If there are references to the input device then memory * will not be freed until last reference is dropped. */ void input_free_device(struct input_dev *dev) { if (dev) { if (dev->devres_managed) WARN_ON(devres_destroy(dev->dev.parent, devm_input_device_release, devm_input_device_match, dev)); input_put_device(dev); } } EXPORT_SYMBOL(input_free_device); /** * input_set_timestamp - set timestamp for input events * @dev: input device to set timestamp for * @timestamp: the time at which the event has occurred * in CLOCK_MONOTONIC * * This function is intended to provide to the input system a more * accurate time of when an event actually occurred. The driver should * call this function as soon as a timestamp is acquired ensuring * clock conversions in input_set_timestamp are done correctly. * * The system entering suspend state between timestamp acquisition and * calling input_set_timestamp can result in inaccurate conversions. */ void input_set_timestamp(struct input_dev *dev, ktime_t timestamp) { dev->timestamp[INPUT_CLK_MONO] = timestamp; dev->timestamp[INPUT_CLK_REAL] = ktime_mono_to_real(timestamp); dev->timestamp[INPUT_CLK_BOOT] = ktime_mono_to_any(timestamp, TK_OFFS_BOOT); } EXPORT_SYMBOL(input_set_timestamp); /** * input_get_timestamp - get timestamp for input events * @dev: input device to get timestamp from * * A valid timestamp is a timestamp of non-zero value. */ ktime_t *input_get_timestamp(struct input_dev *dev) { const ktime_t invalid_timestamp = ktime_set(0, 0); if (!ktime_compare(dev->timestamp[INPUT_CLK_MONO], invalid_timestamp)) input_set_timestamp(dev, ktime_get()); return dev->timestamp; } EXPORT_SYMBOL(input_get_timestamp); /** * input_set_capability - mark device as capable of a certain event * @dev: device that is capable of emitting or accepting event * @type: type of the event (EV_KEY, EV_REL, etc...) * @code: event code * * In addition to setting up corresponding bit in appropriate capability * bitmap the function also adjusts dev->evbit. */ void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code) { if (type < EV_CNT && input_max_code[type] && code > input_max_code[type]) { pr_err("%s: invalid code %u for type %u\n", __func__, code, type); dump_stack(); return; } switch (type) { case EV_KEY: __set_bit(code, dev->keybit); break; case EV_REL: __set_bit(code, dev->relbit); break; case EV_ABS: input_alloc_absinfo(dev); __set_bit(code, dev->absbit); break; case EV_MSC: __set_bit(code, dev->mscbit); break; case EV_SW: __set_bit(code, dev->swbit); break; case EV_LED: __set_bit(code, dev->ledbit); break; case EV_SND: __set_bit(code, dev->sndbit); break; case EV_FF: __set_bit(code, dev->ffbit); break; case EV_PWR: /* do nothing */ break; default: pr_err("%s: unknown type %u (code %u)\n", __func__, type, code); dump_stack(); return; } __set_bit(type, dev->evbit); } EXPORT_SYMBOL(input_set_capability); static unsigned int input_estimate_events_per_packet(struct input_dev *dev) { int mt_slots; int i; unsigned int events; if (dev->mt) { mt_slots = dev->mt->num_slots; } else if (test_bit(ABS_MT_TRACKING_ID, dev->absbit)) { mt_slots = dev->absinfo[ABS_MT_TRACKING_ID].maximum - dev->absinfo[ABS_MT_TRACKING_ID].minimum + 1; mt_slots = clamp(mt_slots, 2, 32); } else if (test_bit(ABS_MT_POSITION_X, dev->absbit)) { mt_slots = 2; } else { mt_slots = 0; } events = mt_slots + 1; /* count SYN_MT_REPORT and SYN_REPORT */ if (test_bit(EV_ABS, dev->evbit)) for_each_set_bit(i, dev->absbit, ABS_CNT) events += input_is_mt_axis(i) ? mt_slots : 1; if (test_bit(EV_REL, dev->evbit)) events += bitmap_weight(dev->relbit, REL_CNT); /* Make room for KEY and MSC events */ events += 7; return events; } #define INPUT_CLEANSE_BITMASK(dev, type, bits) \ do { \ if (!test_bit(EV_##type, dev->evbit)) \ memset(dev->bits##bit, 0, \ sizeof(dev->bits##bit)); \ } while (0) static void input_cleanse_bitmasks(struct input_dev *dev) { INPUT_CLEANSE_BITMASK(dev, KEY, key); INPUT_CLEANSE_BITMASK(dev, REL, rel); INPUT_CLEANSE_BITMASK(dev, ABS, abs); INPUT_CLEANSE_BITMASK(dev, MSC, msc); INPUT_CLEANSE_BITMASK(dev, LED, led); INPUT_CLEANSE_BITMASK(dev, SND, snd); INPUT_CLEANSE_BITMASK(dev, FF, ff); INPUT_CLEANSE_BITMASK(dev, SW, sw); } static void __input_unregister_device(struct input_dev *dev) { struct input_handle *handle, *next; input_disconnect_device(dev); scoped_guard(mutex, &input_mutex) { list_for_each_entry_safe(handle, next, &dev->h_list, d_node) handle->handler->disconnect(handle); WARN_ON(!list_empty(&dev->h_list)); timer_delete_sync(&dev->timer); list_del_init(&dev->node); input_wakeup_procfs_readers(); } device_del(&dev->dev); } static void devm_input_device_unregister(struct device *dev, void *res) { struct input_devres *devres = res; struct input_dev *input = devres->input; dev_dbg(dev, "%s: unregistering device %s\n", __func__, dev_name(&input->dev)); __input_unregister_device(input); } /* * Generate software autorepeat event. Note that we take * dev->event_lock here to avoid racing with input_event * which may cause keys get "stuck". */ static void input_repeat_key(struct timer_list *t) { struct input_dev *dev = timer_container_of(dev, t, timer); guard(spinlock_irqsave)(&dev->event_lock); if (!dev->inhibited && test_bit(dev->repeat_key, dev->key) && is_event_supported(dev->repeat_key, dev->keybit, KEY_MAX)) { input_set_timestamp(dev, ktime_get()); input_handle_event(dev, EV_KEY, dev->repeat_key, 2); input_handle_event(dev, EV_SYN, SYN_REPORT, 1); if (dev->rep[REP_PERIOD]) mod_timer(&dev->timer, jiffies + msecs_to_jiffies(dev->rep[REP_PERIOD])); } } /** * input_enable_softrepeat - enable software autorepeat * @dev: input device * @delay: repeat delay * @period: repeat period * * Enable software autorepeat on the input device. */ void input_enable_softrepeat(struct input_dev *dev, int delay, int period) { dev->timer.function = input_repeat_key; dev->rep[REP_DELAY] = delay; dev->rep[REP_PERIOD] = period; } EXPORT_SYMBOL(input_enable_softrepeat); bool input_device_enabled(struct input_dev *dev) { lockdep_assert_held(&dev->mutex); return !dev->inhibited && dev->users > 0; } EXPORT_SYMBOL_GPL(input_device_enabled); static int input_device_tune_vals(struct input_dev *dev) { struct input_value *vals; unsigned int packet_size; unsigned int max_vals; packet_size = input_estimate_events_per_packet(dev); if (dev->hint_events_per_packet < packet_size) dev->hint_events_per_packet = packet_size; max_vals = dev->hint_events_per_packet + 2; if (dev->max_vals >= max_vals) return 0; vals = kcalloc(max_vals, sizeof(*vals), GFP_KERNEL); if (!vals) return -ENOMEM; scoped_guard(spinlock_irq, &dev->event_lock) { dev->max_vals = max_vals; swap(dev->vals, vals); } /* Because of swap() above, this frees the old vals memory */ kfree(vals); return 0; } /** * input_register_device - register device with input core * @dev: device to be registered * * This function registers device with input core. The device must be * allocated with input_allocate_device() and all it's capabilities * set up before registering. * If function fails the device must be freed with input_free_device(). * Once device has been successfully registered it can be unregistered * with input_unregister_device(); input_free_device() should not be * called in this case. * * Note that this function is also used to register managed input devices * (ones allocated with devm_input_allocate_device()). Such managed input * devices need not be explicitly unregistered or freed, their tear down * is controlled by the devres infrastructure. It is also worth noting * that tear down of managed input devices is internally a 2-step process: * registered managed input device is first unregistered, but stays in * memory and can still handle input_event() calls (although events will * not be delivered anywhere). The freeing of managed input device will * happen later, when devres stack is unwound to the point where device * allocation was made. */ int input_register_device(struct input_dev *dev) { struct input_devres *devres = NULL; struct input_handler *handler; const char *path; int error; if (test_bit(EV_ABS, dev->evbit) && !dev->absinfo) { dev_err(&dev->dev, "Absolute device without dev->absinfo, refusing to register\n"); return -EINVAL; } if (dev->devres_managed) { devres = devres_alloc(devm_input_device_unregister, sizeof(*devres), GFP_KERNEL); if (!devres) return -ENOMEM; devres->input = dev; } /* Every input device generates EV_SYN/SYN_REPORT events. */ __set_bit(EV_SYN, dev->evbit); /* KEY_RESERVED is not supposed to be transmitted to userspace. */ __clear_bit(KEY_RESERVED, dev->keybit); /* Make sure that bitmasks not mentioned in dev->evbit are clean. */ input_cleanse_bitmasks(dev); error = input_device_tune_vals(dev); if (error) goto err_devres_free; /* * If delay and period are pre-set by the driver, then autorepeating * is handled by the driver itself and we don't do it in input.c. */ if (!dev->rep[REP_DELAY] && !dev->rep[REP_PERIOD]) input_enable_softrepeat(dev, 250, 33); if (!dev->getkeycode) dev->getkeycode = input_default_getkeycode; if (!dev->setkeycode) dev->setkeycode = input_default_setkeycode; if (dev->poller) input_dev_poller_finalize(dev->poller); error = device_add(&dev->dev); if (error) goto err_devres_free; path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); pr_info("%s as %s\n", dev->name ? dev->name : "Unspecified device", path ? path : "N/A"); kfree(path); error = -EINTR; scoped_cond_guard(mutex_intr, goto err_device_del, &input_mutex) { list_add_tail(&dev->node, &input_dev_list); list_for_each_entry(handler, &input_handler_list, node) input_attach_handler(dev, handler); input_wakeup_procfs_readers(); } if (dev->devres_managed) { dev_dbg(dev->dev.parent, "%s: registering %s with devres.\n", __func__, dev_name(&dev->dev)); devres_add(dev->dev.parent, devres); } return 0; err_device_del: device_del(&dev->dev); err_devres_free: devres_free(devres); return error; } EXPORT_SYMBOL(input_register_device); /** * input_unregister_device - unregister previously registered device * @dev: device to be unregistered * * This function unregisters an input device. Once device is unregistered * the caller should not try to access it as it may get freed at any moment. */ void input_unregister_device(struct input_dev *dev) { if (dev->devres_managed) { WARN_ON(devres_destroy(dev->dev.parent, devm_input_device_unregister, devm_input_device_match, dev)); __input_unregister_device(dev); /* * We do not do input_put_device() here because it will be done * when 2nd devres fires up. */ } else { __input_unregister_device(dev); input_put_device(dev); } } EXPORT_SYMBOL(input_unregister_device); static int input_handler_check_methods(const struct input_handler *handler) { int count = 0; if (handler->filter) count++; if (handler->events) count++; if (handler->event) count++; if (count > 1) { pr_err("%s: only one event processing method can be defined (%s)\n", __func__, handler->name); return -EINVAL; } return 0; } /** * input_register_handler - register a new input handler * @handler: handler to be registered * * This function registers a new input handler (interface) for input * devices in the system and attaches it to all input devices that * are compatible with the handler. */ int input_register_handler(struct input_handler *handler) { struct input_dev *dev; int error; error = input_handler_check_methods(handler); if (error) return error; scoped_cond_guard(mutex_intr, return -EINTR, &input_mutex) { INIT_LIST_HEAD(&handler->h_list); list_add_tail(&handler->node, &input_handler_list); list_for_each_entry(dev, &input_dev_list, node) input_attach_handler(dev, handler); input_wakeup_procfs_readers(); } return 0; } EXPORT_SYMBOL(input_register_handler); /** * input_unregister_handler - unregisters an input handler * @handler: handler to be unregistered * * This function disconnects a handler from its input devices and * removes it from lists of known handlers. */ void input_unregister_handler(struct input_handler *handler) { struct input_handle *handle, *next; guard(mutex)(&input_mutex); list_for_each_entry_safe(handle, next, &handler->h_list, h_node) handler->disconnect(handle); WARN_ON(!list_empty(&handler->h_list)); list_del_init(&handler->node); input_wakeup_procfs_readers(); } EXPORT_SYMBOL(input_unregister_handler); /** * input_handler_for_each_handle - handle iterator * @handler: input handler to iterate * @data: data for the callback * @fn: function to be called for each handle * * Iterate over @bus's list of devices, and call @fn for each, passing * it @data and stop when @fn returns a non-zero value. The function is * using RCU to traverse the list and therefore may be using in atomic * contexts. The @fn callback is invoked from RCU critical section and * thus must not sleep. */ int input_handler_for_each_handle(struct input_handler *handler, void *data, int (*fn)(struct input_handle *, void *)) { struct input_handle *handle; int retval; guard(rcu)(); list_for_each_entry_rcu(handle, &handler->h_list, h_node) { retval = fn(handle, data); if (retval) return retval; } return 0; } EXPORT_SYMBOL(input_handler_for_each_handle); /* * An implementation of input_handle's handle_events() method that simply * invokes handler->event() method for each event one by one. */ static unsigned int input_handle_events_default(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct input_handler *handler = handle->handler; struct input_value *v; for (v = vals; v != vals + count; v++) handler->event(handle, v->type, v->code, v->value); return count; } /* * An implementation of input_handle's handle_events() method that invokes * handler->filter() method for each event one by one and removes events * that were filtered out from the "vals" array. */ static unsigned int input_handle_events_filter(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct input_handler *handler = handle->handler; struct input_value *end = vals; struct input_value *v; for (v = vals; v != vals + count; v++) { if (handler->filter(handle, v->type, v->code, v->value)) continue; if (end != v) *end = *v; end++; } return end - vals; } /* * An implementation of input_handle's handle_events() method that does nothing. */ static unsigned int input_handle_events_null(struct input_handle *handle, struct input_value *vals, unsigned int count) { return count; } /* * Sets up appropriate handle->event_handler based on the input_handler * associated with the handle. */ static void input_handle_setup_event_handler(struct input_handle *handle) { struct input_handler *handler = handle->handler; if (handler->filter) handle->handle_events = input_handle_events_filter; else if (handler->event) handle->handle_events = input_handle_events_default; else if (handler->events) handle->handle_events = handler->events; else handle->handle_events = input_handle_events_null; } /** * input_register_handle - register a new input handle * @handle: handle to register * * This function puts a new input handle onto device's * and handler's lists so that events can flow through * it once it is opened using input_open_device(). * * This function is supposed to be called from handler's * connect() method. */ int input_register_handle(struct input_handle *handle) { struct input_handler *handler = handle->handler; struct input_dev *dev = handle->dev; input_handle_setup_event_handler(handle); /* * We take dev->mutex here to prevent race with * input_release_device(). */ scoped_cond_guard(mutex_intr, return -EINTR, &dev->mutex) { /* * Filters go to the head of the list, normal handlers * to the tail. */ if (handler->filter) list_add_rcu(&handle->d_node, &dev->h_list); else list_add_tail_rcu(&handle->d_node, &dev->h_list); } /* * Since we are supposed to be called from ->connect() * which is mutually exclusive with ->disconnect() * we can't be racing with input_unregister_handle() * and so separate lock is not needed here. */ list_add_tail_rcu(&handle->h_node, &handler->h_list); if (handler->start) handler->start(handle); return 0; } EXPORT_SYMBOL(input_register_handle); /** * input_unregister_handle - unregister an input handle * @handle: handle to unregister * * This function removes input handle from device's * and handler's lists. * * This function is supposed to be called from handler's * disconnect() method. */ void input_unregister_handle(struct input_handle *handle) { struct input_dev *dev = handle->dev; list_del_rcu(&handle->h_node); /* * Take dev->mutex to prevent race with input_release_device(). */ scoped_guard(mutex, &dev->mutex) list_del_rcu(&handle->d_node); synchronize_rcu(); } EXPORT_SYMBOL(input_unregister_handle); /** * input_get_new_minor - allocates a new input minor number * @legacy_base: beginning or the legacy range to be searched * @legacy_num: size of legacy range * @allow_dynamic: whether we can also take ID from the dynamic range * * This function allocates a new device minor for from input major namespace. * Caller can request legacy minor by specifying @legacy_base and @legacy_num * parameters and whether ID can be allocated from dynamic range if there are * no free IDs in legacy range. */ int input_get_new_minor(int legacy_base, unsigned int legacy_num, bool allow_dynamic) { /* * This function should be called from input handler's ->connect() * methods, which are serialized with input_mutex, so no additional * locking is needed here. */ if (legacy_base >= 0) { int minor = ida_alloc_range(&input_ida, legacy_base, legacy_base + legacy_num - 1, GFP_KERNEL); if (minor >= 0 || !allow_dynamic) return minor; } return ida_alloc_range(&input_ida, INPUT_FIRST_DYNAMIC_DEV, INPUT_MAX_CHAR_DEVICES - 1, GFP_KERNEL); } EXPORT_SYMBOL(input_get_new_minor); /** * input_free_minor - release previously allocated minor * @minor: minor to be released * * This function releases previously allocated input minor so that it can be * reused later. */ void input_free_minor(unsigned int minor) { ida_free(&input_ida, minor); } EXPORT_SYMBOL(input_free_minor); static int __init input_init(void) { int err; err = class_register(&input_class); if (err) { pr_err("unable to register input_dev class\n"); return err; } err = input_proc_init(); if (err) goto fail1; err = register_chrdev_region(MKDEV(INPUT_MAJOR, 0), INPUT_MAX_CHAR_DEVICES, "input"); if (err) { pr_err("unable to register char major %d", INPUT_MAJOR); goto fail2; } return 0; fail2: input_proc_exit(); fail1: class_unregister(&input_class); return err; } static void __exit input_exit(void) { input_proc_exit(); unregister_chrdev_region(MKDEV(INPUT_MAJOR, 0), INPUT_MAX_CHAR_DEVICES); class_unregister(&input_class); } subsys_initcall(input_init); module_exit(input_exit);
410 514 6 406 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BUF_H__ #define __XFS_BUF_H__ #include <linux/list.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/dax.h> #include <linux/uio.h> #include <linux/list_lru.h> extern struct kmem_cache *xfs_buf_cache; /* * Base types */ struct xfs_buf; #define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) #define XBF_READ (1u << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ #define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ #define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ #define XBF_WRITE_FAIL (1u << 7) /* async writes have failed on this buffer */ /* buffer type flags for write callbacks */ #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ /* * Online fsck is scanning the buffer cache for live buffers. Do not warn * about length mismatches during lookups and do not return stale buffers. */ #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" } /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ struct xfs_buf_cache { struct rhashtable bc_hash; }; int xfs_buf_cache_init(struct xfs_buf_cache *bch); void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); /* * The xfs_buftarg contains 2 notions of "sector size" - * * 1) The metadata sector size, which is the minimum unit and * alignment of IO which will be performed by metadata operations. * 2) The device logical sector size * * The first is specified at mkfs time, and is stored on-disk in the * superblock's sb_sectsize. * * The latter is derived from the underlying device, and controls direct IO * alignment constraints. */ struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct dax_device *bt_daxdev; struct file *bt_file; u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; /* Hardware atomic write unit values, bytes */ unsigned int bt_awu_min; unsigned int bt_awu_max; /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; }; struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ unsigned int bm_flags; }; /* * Online fsck is scanning the buffer cache for live buffers. Do not warn * about length mismatches during lookups and do not return stale buffers. */ #define XBM_LIVESCAN (1U << 0) #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { char *name; union { __be32 magic[2]; /* v4 and v5 on disk magic values */ __be16 magic16[2]; /* v4 and v5 on disk magic values */ }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); }; struct xfs_buf { /* * first cacheline holds all the fields needed for an uncontended cache * hit to be fully processed. The semaphore straddles the cacheline * boundary, but the counter and lock sits on the first cacheline, * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ unsigned int b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ /* * concurrent access to b_lru and b_lru_flags are protected by * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; struct xfs_mount *b_mount; struct xfs_buftarg *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; struct completion b_iowait; /* queue for I/O waiters */ struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ int b_error; /* error code on I/O */ void (*b_iodone)(struct xfs_buf *bp); /* * async write failure retry count. Initialised to zero on the first * failure, then when it exceeds the maximum configured without a * success the write is considered to be failed permanently and the * iodone handler will take appropriate action. * * For retry timeouts, we record the jiffy of the first failure. This * means that we can change the retry timeout for buffers already under * I/O and thus avoid getting stuck in a retry loop with a long timeout. * * last_error is used to ensure that we are getting repeated errors, not * different errors. e.g. a block device might change ENOSPC to EIO when * a failure timeout occurs, so we want to re-initialise the error * retry behaviour appropriately when that happens. */ int b_retries; unsigned long b_first_retry_time; /* in jiffies */ int b_last_error; const struct xfs_buf_ops *b_ops; struct rcu_head b_rcu; }; /* Finding and Reading Buffers */ int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops, xfs_failaddr_t fa); void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); static inline int xfs_buf_incore( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp); } static inline int xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, struct xfs_buf **bpp) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_get_map(target, &map, 1, 0, bpp); } static inline int xfs_buf_read( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_read_map(target, &map, 1, flags, bpp, ops, __builtin_return_address(0)); } static inline void xfs_buf_readahead( struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, const struct xfs_buf_ops *ops) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); return xfs_buf_readahead_map(target, &map, 1, ops); } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ extern void xfs_buf_rele(struct xfs_buf *); /* Locking and Unlocking Buffers */ extern int xfs_buf_trylock(struct xfs_buf *); extern void xfs_buf_lock(struct xfs_buf *); extern void xfs_buf_unlock(struct xfs_buf *); #define xfs_buf_islocked(bp) \ ((bp)->b_sema.count <= 0) static inline void xfs_buf_relse(struct xfs_buf *bp) { xfs_buf_unlock(bp); xfs_buf_rele(bp); } /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) { return bp->b_addr + offset; } static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) { memset(bp->b_addr + boff, 0, bsize); } extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { return bp->b_maps[0].bm_bn; } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); /* * If the buffer is already on the LRU, do nothing. Otherwise set the buffer * up with a reference count of 0 so it will be tossed from the cache when * released. */ static inline void xfs_buf_oneshot(struct xfs_buf *bp) { if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1) return; atomic_set(&bp->b_lru_ref, 0); } static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); } static inline int xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), cksum_offset); } static inline void xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) { xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), cksum_offset); } /* * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, struct file *bdev_file); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, xfs_fsblock_t nr_blocks); #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); /* for xfs_buf_mem.c only: */ int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize, const char *descr); void xfs_destroy_buftarg(struct xfs_buftarg *btp); #endif /* __XFS_BUF_H__ */
7 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif
17 1 18 1 18 18 18 17 17 18 1 6 2 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 // SPDX-License-Identifier: GPL-2.0-only #include <linux/bcd.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/mc146818rtc.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> #endif #define UIP_RECHECK_DELAY 100 /* usec */ #define UIP_RECHECK_DELAY_MS (USEC_PER_MSEC / UIP_RECHECK_DELAY) #define UIP_RECHECK_LOOPS_MS(x) (x / UIP_RECHECK_DELAY_MS) /* * Execute a function while the UIP (Update-in-progress) bit of the RTC is * unset. The timeout is configurable by the caller in ms. * * Warning: callback may be executed more then once. */ bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), int timeout, void *param) { int i; unsigned long flags; unsigned char seconds; for (i = 0; UIP_RECHECK_LOOPS_MS(i) < timeout; i++) { spin_lock_irqsave(&rtc_lock, flags); /* * Check whether there is an update in progress during which the * readout is unspecified. The maximum update time is ~2ms. Poll * for completion. * * Store the second value before checking UIP so a long lasting * NMI which happens to hit after the UIP check cannot make * an update cycle invisible. */ seconds = CMOS_READ(RTC_SECONDS); if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { spin_unlock_irqrestore(&rtc_lock, flags); udelay(UIP_RECHECK_DELAY); continue; } /* Revalidate the above readout */ if (seconds != CMOS_READ(RTC_SECONDS)) { spin_unlock_irqrestore(&rtc_lock, flags); continue; } if (callback) callback(seconds, param); /* * Check for the UIP bit again. If it is set now then * the above values may contain garbage. */ if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { spin_unlock_irqrestore(&rtc_lock, flags); udelay(UIP_RECHECK_DELAY); continue; } /* * A NMI might have interrupted the above sequence so check * whether the seconds value has changed which indicates that * the NMI took longer than the UIP bit was set. Unlikely, but * possible and there is also virt... */ if (seconds != CMOS_READ(RTC_SECONDS)) { spin_unlock_irqrestore(&rtc_lock, flags); continue; } spin_unlock_irqrestore(&rtc_lock, flags); if (UIP_RECHECK_LOOPS_MS(i) >= 100) pr_warn("Reading current time from RTC took around %li ms\n", UIP_RECHECK_LOOPS_MS(i)); return true; } return false; } EXPORT_SYMBOL_GPL(mc146818_avoid_UIP); /* * If the UIP (Update-in-progress) bit of the RTC is set for more then * 10ms, the RTC is apparently broken or not present. */ bool mc146818_does_rtc_work(void) { return mc146818_avoid_UIP(NULL, 1000, NULL); } EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); struct mc146818_get_time_callback_param { struct rtc_time *time; unsigned char ctrl; #ifdef CONFIG_ACPI unsigned char century; #endif #ifdef CONFIG_MACH_DECSTATION unsigned int real_year; #endif }; static void mc146818_get_time_callback(unsigned char seconds, void *param_in) { struct mc146818_get_time_callback_param *p = param_in; /* * Only the values that we read from the RTC are set. We leave * tm_wday, tm_yday and tm_isdst untouched. Even though the * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated * by the RTC when initially set to a non-zero value. */ p->time->tm_sec = seconds; p->time->tm_min = CMOS_READ(RTC_MINUTES); p->time->tm_hour = CMOS_READ(RTC_HOURS); p->time->tm_mday = CMOS_READ(RTC_DAY_OF_MONTH); p->time->tm_mon = CMOS_READ(RTC_MONTH); p->time->tm_year = CMOS_READ(RTC_YEAR); #ifdef CONFIG_MACH_DECSTATION p->real_year = CMOS_READ(RTC_DEC_YEAR); #endif #ifdef CONFIG_ACPI if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && acpi_gbl_FADT.century) { p->century = CMOS_READ(acpi_gbl_FADT.century); } else { p->century = 0; } #endif p->ctrl = CMOS_READ(RTC_CONTROL); } /** * mc146818_get_time - Get the current time from the RTC * @time: pointer to struct rtc_time to store the current time * @timeout: timeout value in ms * * This function reads the current time from the RTC and stores it in the * provided struct rtc_time. The timeout parameter specifies the maximum * time to wait for the RTC to become ready. * * Return: 0 on success, -ETIMEDOUT if the RTC did not become ready within * the specified timeout, or another error code if an error occurred. */ int mc146818_get_time(struct rtc_time *time, int timeout) { struct mc146818_get_time_callback_param p = { .time = time }; if (!mc146818_avoid_UIP(mc146818_get_time_callback, timeout, &p)) { memset(time, 0, sizeof(*time)); return -ETIMEDOUT; } if (!(p.ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { time->tm_sec = bcd2bin(time->tm_sec); time->tm_min = bcd2bin(time->tm_min); time->tm_hour = bcd2bin(time->tm_hour); time->tm_mday = bcd2bin(time->tm_mday); time->tm_mon = bcd2bin(time->tm_mon); time->tm_year = bcd2bin(time->tm_year); #ifdef CONFIG_ACPI p.century = bcd2bin(p.century); #endif } #ifdef CONFIG_MACH_DECSTATION time->tm_year += p.real_year - 72; #endif #ifdef CONFIG_ACPI if (p.century > 19) time->tm_year += (p.century - 19) * 100; #endif /* * Account for differences between how the RTC uses the values * and how they are defined in a struct rtc_time; */ if (time->tm_year <= 69) time->tm_year += 100; time->tm_mon--; return 0; } EXPORT_SYMBOL_GPL(mc146818_get_time); /* AMD systems don't allow access to AltCentury with DV1 */ static bool apply_amd_register_a_behavior(void) { #ifdef CONFIG_X86 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) return true; #endif return false; } /* Set the current date and time in the real time clock. */ int mc146818_set_time(struct rtc_time *time) { unsigned long flags; unsigned char mon, day, hrs, min, sec; unsigned char save_control, save_freq_select; unsigned int yrs; #ifdef CONFIG_MACH_DECSTATION unsigned int real_yrs; #endif unsigned char century = 0; yrs = time->tm_year; mon = time->tm_mon + 1; /* tm_mon starts at zero */ day = time->tm_mday; hrs = time->tm_hour; min = time->tm_min; sec = time->tm_sec; if (yrs > 255) /* They are unsigned */ return -EINVAL; #ifdef CONFIG_MACH_DECSTATION real_yrs = yrs; yrs = 72; /* * We want to keep the year set to 73 until March * for non-leap years, so that Feb, 29th is handled * correctly. */ if (!is_leap_year(real_yrs + 1900) && mon < 3) { real_yrs--; yrs = 73; } #endif #ifdef CONFIG_ACPI if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && acpi_gbl_FADT.century) { century = (yrs + 1900) / 100; yrs %= 100; } #endif /* These limits and adjustments are independent of * whether the chip is in binary mode or not. */ if (yrs > 169) return -EINVAL; if (yrs >= 100) yrs -= 100; spin_lock_irqsave(&rtc_lock, flags); save_control = CMOS_READ(RTC_CONTROL); spin_unlock_irqrestore(&rtc_lock, flags); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { sec = bin2bcd(sec); min = bin2bcd(min); hrs = bin2bcd(hrs); day = bin2bcd(day); mon = bin2bcd(mon); yrs = bin2bcd(yrs); century = bin2bcd(century); } spin_lock_irqsave(&rtc_lock, flags); save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); if (apply_amd_register_a_behavior()) CMOS_WRITE((save_freq_select & ~RTC_AMD_BANK_SELECT), RTC_FREQ_SELECT); else CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif CMOS_WRITE(yrs, RTC_YEAR); CMOS_WRITE(mon, RTC_MONTH); CMOS_WRITE(day, RTC_DAY_OF_MONTH); CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); #ifdef CONFIG_ACPI if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && acpi_gbl_FADT.century) CMOS_WRITE(century, acpi_gbl_FADT.century); #endif CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); spin_unlock_irqrestore(&rtc_lock, flags); return 0; } EXPORT_SYMBOL_GPL(mc146818_set_time);
10 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook */ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ #include <linux/cache.h> #include <linux/list.h> #include <linux/spinlock_types.h> #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) #define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { BPF_LRU_LIST_T_ACTIVE, BPF_LRU_LIST_T_INACTIVE, BPF_LRU_LIST_T_FREE, BPF_LRU_LOCAL_LIST_T_FREE, BPF_LRU_LOCAL_LIST_T_PENDING, }; struct bpf_lru_node { struct list_head list; u16 cpu; u8 type; u8 ref; }; struct bpf_lru_list { struct list_head lists[NR_BPF_LRU_LIST_T]; unsigned int counts[NR_BPF_LRU_LIST_COUNT]; /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; raw_spinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; u16 next_steal; raw_spinlock_t lock; }; struct bpf_common_lru { struct bpf_lru_list lru_list; struct bpf_lru_locallist __percpu *local_list; }; typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); struct bpf_lru { union { struct bpf_common_lru common_lru; struct bpf_lru_list __percpu *percpu_lru; }; del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; unsigned int target_free; unsigned int nr_scans; bool percpu; }; static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) { if (!READ_ONCE(node->ref)) WRITE_ONCE(node->ref, 1); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems); void bpf_lru_destroy(struct bpf_lru *lru); struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); #endif
659 201 519 518 517 518 518 2 2 2 2 2 247 248 248 248 6 242 196 67 248 248 248 248 248 238 240 54 4 207 179 74 237 2 1 238 5 236 234 234 5 239 237 238 239 238 52 189 107 189 97 100 186 3 55 54 22 51 43 1 1 7 5 7 3 55 51 196 196 52 7 236 235 48 193 14 14 14 14 14 10 4 14 75 74 7 280 65 250 59 31 34 39 39 163 306 143 60 197 195 117 271 201 134 137 308 154 138 7 1 12 10 2 5 6 11 218 13 206 337 322 232 2 241 6 6 6 6 6 61 73 28 7 1 34 7 1 7 7 105 248 248 9 2 3 3 1 7 12 2 10 2 7 1 10 296 264 102 21 1 4 12 7 165 9 152 2 5 5 2 3 259 81 159 2 39 39 39 211 213 17 2 15 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2002, 2004 * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * Copyright (c) 2002-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * SCTP over IPv6. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Le Yanqun <yanqun.le@nokia.com> * Hui Huang <hui.huang@nokia.com> * La Monte H.P. Yarroll <piggy@acm.org> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * * Based on: * linux/net/ipv6/tcp_ipv6.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/ipsec.h> #include <linux/slab.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/seq_file.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/sctp/sctp.h> #include <net/udp_tunnel.h> #include <linux/uaccess.h> static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2); static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port); static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2); /* Event handler for inet6 address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->idev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET6 && ipv6_addr_equal(&addr->a.v6.sin6_addr, &ifa->addr) && addr->a.v6.sin6_scope_id == ifa->idev->dev->ifindex) { found = 1; addr->valid = 0; list_del_rcu(&addr->list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } static struct notifier_block sctp_inet6addr_notifier = { .notifier_call = sctp_inet6addr_event, }; static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMPV6_PKT_TOOBIG: if (ip6_sk_accept_pmtu(sk)) sctp_icmp_frag_needed(sk, asoc, t, info); return; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } break; case NDISC_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: break; } icmpv6_err_convert(type, code, &err); if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { sk->sk_err = err; sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } } /* ICMP error handler. */ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, offset); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original pointers. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } sctp_v6_err_handle(transport, skb, type, code, ntohl(info)); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmp6hdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmp6hdr *)(skb_network_header(skb) - sizeof(struct icmp6hdr)); if (hdr->icmp6_type == NDISC_REDIRECT) { /* can't be handled without outer ip6hdr known, leave it to udpv6_err */ sctp_err_finish(sk, t); return 0; } if (hdr->icmp6_type == ICMPV6_PKT_TOOBIG) info = ntohl(hdr->icmp6_mtu); sctp_v6_err_handle(t, skb, hdr->icmp6_type, hdr->icmp6_code, info); sctp_err_finish(sk, t); return 1; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { struct dst_entry *dst = dst_clone(t->dst); struct flowi6 *fl6 = &t->fl.u.ip6; struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); __u8 tclass = np->tclass; __be32 label; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); if (t->dscp & SCTP_DSCP_SET_MASK) tclass = t->dscp & SCTP_DSCP_VAL_MASK; if (INET_ECN_is_capable(tclass)) IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(t->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); if (!t->encap_port || !sctp_sk(sk)->udp_port) { int res; skb_dst_set(skb, dst); rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; } if (skb_is_gso(skb)) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; skb->encapsulation = 1; skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, sctp_sk(sk)->udp_port, t->encap_port, false, 0); return 0; } /* Returns the dst cache entry for the given source and destination ip * addresses. */ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct dst_entry *dst = NULL; struct flowi _fl; struct flowi6 *fl6 = &_fl.u.ip6; struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; enum sctp_scope scope; __u8 matchlen = 0; memset(&_fl, 0, sizeof(_fl)); fl6->daddr = daddr->v6.sin6_addr; fl6->fl6_dport = daddr->v6.sin6_port; fl6->flowi6_proto = IPPROTO_SCTP; if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); if (inet6_test_bit(SNDFLOW, sk) && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) goto out; fl6_sock_release(flowlabel); } pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); if (asoc) fl6->fl6_sport = htons(asoc->base.bind_addr.port); if (saddr) { fl6->saddr = saddr->v6.sin6_addr; if (!fl6->fl6_sport) fl6->fl6_sport = saddr->v6.sin6_port; pr_debug("src=%pI6 - ", &fl6->saddr); } rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!asoc || saddr) { t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } bp = &asoc->base.bind_addr; scope = sctp_scope(daddr); /* ip6_dst_lookup has filled in the fl6->saddr for us. Check * to see if we can use it. */ if (!IS_ERR(dst)) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || laddr->state == SCTP_ADDR_DEL || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; /* Do not compare against v4 addrs */ if ((laddr->a.sa.sa_family == AF_INET6) && (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) { rcu_read_unlock(); t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get the * best source address for a given destination. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct dst_entry *bdst; __u8 bmatchlen; if (!laddr->valid || laddr->state != SCTP_ADDR_SRC || laddr->a.sa.sa_family != AF_INET6 || scope > sctp_scope(&laddr->a)) continue; fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; if (ipv6_chk_addr(dev_net(bdst->dev), &laddr->a.v6.sin6_addr, bdst->dev, 1)) { if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); if (matchlen > bmatchlen) { dst_release(bdst); continue; } if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; matchlen = bmatchlen; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } rcu_read_unlock(); out: if (!IS_ERR_OR_NULL(dst)) { struct rt6_info *rt; rt = dst_rt6_info(dst); t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl->u.ip6.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* Returns the number of consecutive initial bits that match in the 2 ipv6 * addresses. */ static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2) { return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr); } /* Fills in the source address(saddr) based on the destination address(daddr) * and asoc's bind address list. */ static void sctp_v6_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { struct flowi6 *fl6 = &fl->u.ip6; union sctp_addr *saddr = &t->saddr; pr_debug("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst); if (t->dst) { saddr->v6.sin6_family = AF_INET6; saddr->v6.sin6_addr = fl6->saddr; } } /* Make a copy of all potential local addresses. */ static void sctp_v6_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct inet6_dev *in6_dev; struct inet6_ifaddr *ifp; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in6_dev = __in6_dev_get(dev)) == NULL) { rcu_read_unlock(); return; } read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifp->addr; addr->a.v6.sin6_scope_id = dev->ifindex; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } read_unlock_bh(&in6_dev->lock); rcu_read_unlock(); } /* Copy over any ip options */ static void sctp_v6_copy_ip_options(struct sock *sk, struct sock *newsk) { struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct ipv6_txoptions *opt; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } /* Account for the IP options */ static int sctp_v6_ip_options_len(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; int len = 0; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) len = opt->opt_flen + opt->opt_nflen; rcu_read_unlock(); return len; } /* Initialize a sockaddr_storage from in incoming skb. */ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in6 *sa = &addr->v6; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; if (is_saddr) { sa->sin6_port = sh->source; sa->sin6_addr = ipv6_hdr(skb)->saddr; } else { sa->sin6_port = sh->dest; sa->sin6_addr = ipv6_hdr(skb)->daddr; } } /* Initialize an sctp_addr from a socket. */ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; addr->v6.sin6_scope_id = 0; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_rcv_saddr.s6_addr32[0] = 0; sk->sk_v6_rcv_saddr.s6_addr32[1] = 0; sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_rcv_saddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_rcv_saddr = addr->v6.sin6_addr; } } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_daddr.s6_addr32[0] = 0; sk->sk_v6_daddr.s6_addr32[1] = 0; sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_daddr = addr->v6.sin6_addr; } } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) return false; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v6_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv6addr_param); param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); param->v6.addr = addr->v6.sin6_addr; return length; } /* Initialize a sctp_addr from struct in6_addr. */ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port) { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; addr->v6.sin6_scope_id = 0; } static int __sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) { if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr2->v6.sin6_addr) && addr2->v6.sin6_addr.s6_addr32[3] == addr1->v4.sin_addr.s_addr) return 1; if (addr2->sa.sa_family == AF_INET && addr1->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr1->v6.sin6_addr) && addr1->v6.sin6_addr.s6_addr32[3] == addr2->v4.sin_addr.s_addr) return 1; return 0; } if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) && addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id && addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id) return 0; return 1; } /* Compare addresses exactly. * v4-mapped-v6 is also in consideration. */ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { return __sctp_v6_cmp_addr(addr1, addr2) && addr1->v6.sin6_port == addr2->v6.sin6_port; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port) { memset(addr, 0x00, sizeof(union sctp_addr)); addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; } /* Is this a wildcard address? */ static int sctp_v6_is_any(const union sctp_addr *addr) { return ipv6_addr_any(&addr->v6.sin6_addr); } /* Should this be available for binding? */ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); struct net_device *dev = NULL; int type, res, bound_dev_if; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) return 1; if (type == IPV6_ADDR_MAPPED) { if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->available(addr, sp); } if (!(type & IPV6_ADDR_UNICAST)) return 0; rcu_read_lock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if) { res = 0; dev = dev_get_by_index_rcu(net, bound_dev_if); if (!dev) goto out; } res = ipv6_can_nonlocal_bind(net, &sp->inet) || ipv6_chk_addr(net, in6, dev, 0); out: rcu_read_unlock(); return res; } /* This function checks if the address is a valid address to be used for * SCTP. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { int ret = ipv6_addr_type(&addr->v6.sin6_addr); /* Support v4-mapped-v6 address. */ if (ret == IPV6_ADDR_MAPPED) { /* Note: This routine is used in input, so v4-mapped-v6 * are disallowed here when there is no sctp_sock. */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); } /* Is this a non-unicast address */ if (!(ret & IPV6_ADDR_UNICAST)) return 0; return 1; } /* What is the scope of 'addr'? */ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) { enum sctp_scope retval; int v6scope; /* The IPv6 scope is really a set of bit fields. * See IFA_* in <net/if_inet6.h>. Map to a generic SCTP scope. */ v6scope = ipv6_addr_scope(&addr->v6.sin6_addr); switch (v6scope) { case IFA_HOST: retval = SCTP_SCOPE_LOOPBACK; break; case IFA_LINK: retval = SCTP_SCOPE_LINK; break; case IFA_SITE: retval = SCTP_SCOPE_PRIVATE; break; default: retval = SCTP_SCOPE_GLOBAL; break; } return retval; } /* Create and initialize a new sk for the socket to be returned by accept(). */ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(sk, SOCK_ZAPPED); newsctp6sk = (struct sctp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; sctp_v6_copy_ip_options(sk, newsk); /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). */ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } /* Format a sockaddr for return to user space. This makes sure the return is * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. */ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { if (sp->v4mapped) { if (addr->sa.sa_family == AF_INET) sctp_v4_map_v6(addr); } else { if (addr->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr->v6.sin6_addr)) sctp_v6_map_v4(addr); } if (addr->sa.sa_family == AF_INET) { memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } return sizeof(struct sockaddr_in6); } /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { return inet6_iif(skb); } static int sctp_v6_skb_sdif(const struct sk_buff *skb) { return inet6_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr); } static void sctp_v6_ecn_capable(struct sock *sk) { inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } /* Initialize a PF_INET msgname from a ulpevent. */ static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addrlen) { union sctp_addr *addr; struct sctp_association *asoc; union sctp_addr *paddr; if (!msgname) return; addr = (union sctp_addr *)msgname; asoc = event->asoc; paddr = &asoc->peer.primary_addr; if (paddr->sa.sa_family == AF_INET) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = htons(asoc->peer.port); addr->v4.sin_addr = paddr->v4.sin_addr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id; else addr->v6.sin6_scope_id = 0; addr->v6.sin6_port = htons(asoc->peer.port); addr->v6.sin6_addr = paddr->v6.sin6_addr; } *addrlen = sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr); } /* Initialize a msg_name from an inbound skb. */ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, int *addr_len) { union sctp_addr *addr; struct sctphdr *sh; if (!msgname) return; addr = (union sctp_addr *)msgname; sh = sctp_hdr(skb); if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); else addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); } /* Do we support this AF? */ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) { switch (family) { case AF_INET6: return 1; /* v4-mapped-v6 addresses */ case AF_INET: if (!ipv6_only_sock(sctp_opt2sk(sp))) return 1; fallthrough; default: return 0; } } /* Address matching with wildcards allowed. This extra level * of indirection lets us choose whether a PF_INET6 should * disallow any v4 addresses if we so choose. */ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { struct sock *sk = sctp_opt2sk(opt); struct sctp_af *af1, *af2; af1 = sctp_get_af_specific(addr1->sa.sa_family); af2 = sctp_get_af_specific(addr2->sa.sa_family); if (!af1 || !af2) return 0; /* If the socket is IPv6 only, v4 addrs will not match */ if (ipv6_only_sock(sk) && af1 != af2) return 0; /* Today, wildcard AF_INET/AF_INET6. */ if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2)) return 1; if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET) return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr; return __sctp_v6_cmp_addr(addr1, addr2); } /* Verify that the provided sockaddr looks bindable. Common verification, * has already been taken care of. */ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { struct net *net; if (!addr->v6.sin6_scope_id) return 0; net = sock_net(&opt->inet.sk); rcu_read_lock(); dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id); if (!dev || !(ipv6_can_nonlocal_bind(net, &opt->inet) || ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0))) { rcu_read_unlock(); return 0; } rcu_read_unlock(); } af = opt->pf->af; } return af->available(addr, opt); } /* Verify that the provided sockaddr looks sendable. Common verification, * has already been taken care of. */ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af = NULL; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { if (!addr->v6.sin6_scope_id) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(&opt->inet.sk), addr->v6.sin6_scope_id); rcu_read_unlock(); if (!dev) return 0; } af = opt->pf->af; } return af != NULL; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Note: In the future, we may want to look at sock options * to determine whether a PF_INET6 socket really wants to have IPV4 * addresses. * Returns number of addresses supported. */ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV6_ADDRESS; if (!opt || !ipv6_only_sock(sctp_opt2sk(opt))) { types[1] = SCTP_PARAM_IPV4_ADDRESS; return 2; } return 1; } /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int rc; rc = inet6_getname(sock, uaddr, peer); if (rc < 0) return rc; rc = sctp_v6_addr_to_user(sctp_sk(sock->sk), (union sctp_addr *)uaddr); return rc; } static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, .poll = sctp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw sctpv6_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctpv6_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG, }; static int sctp6_rcv(struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb) ? -1 : 0; } static const struct inet6_protocol sctpv6_protocol = { .handler = sctp6_rcv, .err_handler = sctp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; static struct sctp_af sctp_af_inet6 = { .sa_family = AF_INET6, .sctp_xmit = sctp_v6_xmit, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .get_dst = sctp_v6_get_dst, .get_saddr = sctp_v6_get_saddr, .copy_addrlist = sctp_v6_copy_addrlist, .from_skb = sctp_v6_from_skb, .from_sk = sctp_v6_from_sk, .from_addr_param = sctp_v6_from_addr_param, .to_addr_param = sctp_v6_to_addr_param, .cmp_addr = sctp_v6_cmp_addr, .scope = sctp_v6_scope, .addr_valid = sctp_v6_addr_valid, .inaddr_any = sctp_v6_inaddr_any, .is_any = sctp_v6_is_any, .available = sctp_v6_available, .skb_iif = sctp_v6_skb_iif, .skb_sdif = sctp_v6_skb_sdif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, .ecn_capable = sctp_v6_ecn_capable, .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), .ip_options_len = sctp_v6_ip_options_len, }; static struct sctp_pf sctp_pf_inet6 = { .event_msgname = sctp_inet6_event_msgname, .skb_msgname = sctp_inet6_skb_msgname, .af_supported = sctp_inet6_af_supported, .cmp_addr = sctp_inet6_cmp_addr, .bind_verify = sctp_inet6_bind_verify, .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, .create_accept_sk = sctp_v6_create_accept_sk, .addr_to_user = sctp_v6_addr_to_user, .to_sk_saddr = sctp_v6_to_sk_saddr, .to_sk_daddr = sctp_v6_to_sk_daddr, .copy_ip_options = sctp_v6_copy_ip_options, .af = &sctp_af_inet6, }; /* Initialize IPv6 support and register with socket layer. */ void sctp_v6_pf_init(void) { /* Register the SCTP specific PF_INET6 functions. */ sctp_register_pf(&sctp_pf_inet6, PF_INET6); /* Register the SCTP specific AF_INET6 functions. */ sctp_register_af(&sctp_af_inet6); } void sctp_v6_pf_exit(void) { list_del(&sctp_af_inet6.list); } /* Initialize IPv6 support and register with socket layer. */ int sctp_v6_protosw_init(void) { int rc; rc = proto_register(&sctpv6_prot, 1); if (rc) return rc; /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ inet6_register_protosw(&sctpv6_seqpacket_protosw); inet6_register_protosw(&sctpv6_stream_protosw); return 0; } void sctp_v6_protosw_exit(void) { inet6_unregister_protosw(&sctpv6_seqpacket_protosw); inet6_unregister_protosw(&sctpv6_stream_protosw); proto_unregister(&sctpv6_prot); } /* Register with inet6 layer. */ int sctp_v6_add_protocol(void) { /* Register notifier for inet6 address additions/deletions. */ register_inet6addr_notifier(&sctp_inet6addr_notifier); if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } /* Unregister with inet6 layer. */ void sctp_v6_del_protocol(void) { inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); unregister_inet6addr_notifier(&sctp_inet6addr_notifier); }
1 49 1 1 49 49 49 18 9 11 11 9 2 3 3 16 16 14 4 8 8 24 24 16 8 18 18 8 10 4 2 1 2 50 21 29 17 5 4 1 2 2 1 1 7 4 1 2 1 9 255 255 1 2 251 23 230 4 228 228 2 168 86 1 1 1 1 1 1 1 1 210 6 5 2 2 198 18 2 3 23 1 2 1 2 1 2 2 14 7 1 1 3 2 18 1 1 2 2 15 2 1 1 1 1 2 1 1 30 1 1 2 1 1 1 26 33 33 18 18 18 15 15 15 98 1 3 2 1 95 66 1 2 2 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 3 2 60 2 1 7 18 13 2 1 10 11 21 1 1 17 8 9 2 18 8 9 1 6 1 5 5 1 4 3 3 3 3 9 9 1 1 1 5 5 4 23 22 22 1 14 14 14 5 627 628 629 55 243 188 87 30 5 3 5 6 14 2 21 1 1 19 11 6 1 1 3 1 3 1 3 1 1 2 6 8 6 4 4 3 1 4 1 1 1 1 2 2 1 1 1 1 43 1 43 1 1 1 1 1 1 32 4 4 5 1 1 33 33 25 7 28 3 3 30 6 1 23 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bpf_perf_event.h> #include <linux/btf.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/error-injection.h> #include <linux/btf_ids.h> #include <linux/bpf_lsm.h> #include <linux/fprobe.h> #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/key.h> #include <linux/namei.h> #include <net/bpf_sk_storage.h> #include <uapi/linux/bpf.h> #include <uapi/linux/btf.h> #include <asm/tlb.h> #include "trace_probe.h" #include "trace.h" #define CREATE_TRACE_POINTS #include "bpf_trace.h" #define bpf_event_rcu_dereference(p) \ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; struct list_head list; }; static LIST_HEAD(bpf_trace_modules); static DEFINE_MUTEX(bpf_module_mutex); static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) { struct bpf_raw_event_map *btp, *ret = NULL; struct bpf_trace_module *btm; unsigned int i; mutex_lock(&bpf_module_mutex); list_for_each_entry(btm, &bpf_trace_modules, list) { for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { btp = &btm->module->bpf_raw_events[i]; if (!strcmp(btp->tp->name, name)) { if (try_module_get(btm->module)) ret = btp; goto out; } } } out: mutex_unlock(&bpf_module_mutex); return ret; } #else static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) { return NULL; } #endif /* CONFIG_MODULES */ u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx); /** * trace_call_bpf - invoke BPF program * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. * Can be used from static tracepoints in the future. * * Return: BPF programs always return an integer which is interpreted by * kprobe handler as: * 0 - return from kprobe (event is filtered out) * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { /* * since some bpf program is already running on this cpu, * don't call into another bpf program (same or different) * and don't send kprobe event into ring-buffer, * so return zero here */ rcu_read_lock(); bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array)); rcu_read_unlock(); ret = 0; goto out; } /* * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock * to all call sites, we did a bpf_prog_array_valid() there to check * whether call->prog_array is empty or not, which is * a heuristic to speed up execution. * * If bpf_prog_array_valid() fetched prog_array was * non-NULL, we go into trace_call_bpf() and do the actual * proper rcu_dereference() under RCU lock. * If it turns out that prog_array is NULL then, we bail out. * For the opposite, if the bpf_prog_array_valid() fetched pointer * was NULL, you'll skip the prog_array with the risk of missing * out of events when it was updated in between this and the * rcu_dereference() which is accepted risk. */ rcu_read_lock(); ret = bpf_prog_run_array(rcu_dereference(call->prog_array), ctx, bpf_prog_run); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return ret; } #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { regs_set_return_value(regs, rc); override_function_with_return(regs); return 0; } static const struct bpf_func_proto bpf_override_return_proto = { .func = bpf_override_return, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; #endif static __always_inline int bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; ret = copy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void __user *, unsafe_ptr) { return bpf_probe_read_user_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_user_proto = { .func = bpf_probe_read_user, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static __always_inline int bpf_probe_read_user_str_common(void *dst, u32 size, const void __user *unsafe_ptr) { int ret; /* * NB: We rely on strncpy_from_user() not copying junk past the NUL * terminator into `dst`. * * strncpy_from_user() does long-sized strides in the fast path. If the * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, * then there could be junk after the NUL in `dst`. If user takes `dst` * and keys a hash map with it, then semantically identical strings can * occupy multiple entries in the map. */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, const void __user *, unsafe_ptr) { return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_user_str_proto = { .func = bpf_probe_read_user_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_proto = { .func = bpf_probe_read_kernel, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static __always_inline int bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { int ret; /* * The strncpy_from_kernel_nofault() call will likely not fill the * entire buffer, but that's okay in this circumstance as we're probing * arbitrary memory anyway similar to bpf_probe_read_*() and might * as well probe the stack. Thus, memory is explicitly cleared * only in error case, so that improper users ignoring return * code altogether don't copy garbage; otherwise length of string * is returned that can be used for bpf_perf_event_output() et al. */ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); return ret; } BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, const void *, unsafe_ptr) { return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { .func = bpf_probe_read_kernel_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, const void *, unsafe_ptr) { if ((unsigned long)unsafe_ptr < TASK_SIZE) { return bpf_probe_read_user_common(dst, size, (__force void __user *)unsafe_ptr); } return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_proto = { .func = bpf_probe_read_compat, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, const void *, unsafe_ptr) { if ((unsigned long)unsafe_ptr < TASK_SIZE) { return bpf_probe_read_user_str_common(dst, size, (__force void __user *)unsafe_ptr); } return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); } static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { .func = bpf_probe_read_compat_str, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, u32, size) { /* * Ensure we're in user context which is safe for the helper to * run. This helper has no business in a kthread. * * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. * * nmi_uaccess_okay() ensures the probe is not run in an interim * state, when the task or mm are switched. This is specifically * required to prevent the use of temporary mm. */ if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; return copy_to_user_nofault(unsafe_ptr, src, size); } static const struct bpf_func_proto bpf_probe_write_user_proto = { .func = bpf_probe_write_user, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; int ret; ret = bpf_bprintf_prepare(fmt, fmt_size, args, MAX_TRACE_PRINTK_VARARGS, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, * so enable the associated bpf_trace/bpf_trace_printk event. * Repeat this each time as it is possible a user has * disabled bpf_trace_printk events. By loading a program * calling bpf_trace_printk() however the user has expressed * the intent to see such events. */ if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; int ret, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (ret < 0) return ret; ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); trace_bpf_trace_printk(data.buf); bpf_bprintf_cleanup(&data); return ret; } static const struct bpf_func_proto bpf_trace_vprintk_proto = { .func = bpf_trace_vprintk, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, args, u32, data_len) { struct bpf_bprintf_data data = { .get_bin_args = true, }; int err, num_args; if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || (data_len && !args)) return -EINVAL; num_args = data_len / 8; err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); if (err < 0) return err; seq_bprintf(m, fmt, data.bin_args); bpf_bprintf_cleanup(&data); return seq_has_overflowed(m) ? -EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) { return seq_write(m, data, len) ? -EOVERFLOW : 0; } static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, u32, btf_ptr_size, u64, flags) { const struct btf *btf; s32 btf_id; int ret; ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); if (ret) return ret; return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); } static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .func = bpf_seq_printf_btf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); if (!ee) return -ENOENT; return perf_event_read_local(ee->event, value, enabled, running); } BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) { u64 value = 0; int err; err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi */ if (err) return err; return value; } const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_perf_event_value))) goto clear; err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, &buf->running); if (unlikely(err)) goto clear; return 0; clear: memset(buf, 0, size); return err; } static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .func = bpf_perf_event_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) { return &bpf_perf_event_read_value_proto; } static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); if (!ee) return -ENOENT; event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_save_raw_data(sd, event, raw); return perf_event_output(event, sd, regs); } /* * Support executing tracepoints in normal, irq, and nmi context that each call * bpf_perf_event_output */ struct bpf_trace_sample_data { struct perf_sample_data sds[3]; }; static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; struct perf_sample_data *sd; int nest_level, err; preempt_disable(); sds = this_cpu_ptr(&bpf_trace_sds); nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; goto out; } sd = &sds->sds[nest_level - 1]; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { err = -EINVAL; goto out; } perf_sample_data_init(sd, 0, 0); err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static DEFINE_PER_CPU(int, bpf_event_output_nest_level); struct bpf_nested_pt_regs { struct pt_regs regs[3]; }; static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, .data = ctx, }; struct perf_raw_record raw = { .frag = { { .next = ctx_size ? &frag : NULL, }, .size = meta_size, .data = meta, }, }; struct perf_sample_data *sd; struct pt_regs *regs; int nest_level; u64 ret; preempt_disable(); nest_level = this_cpu_inc_return(bpf_event_output_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; } sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); return ret; } BPF_CALL_0(bpf_get_current_task) { return (long) current; } const struct bpf_func_proto bpf_get_current_task_proto = { .func = bpf_get_current_task, .gpl_only = true, .ret_type = RET_INTEGER, }; BPF_CALL_0(bpf_get_current_task_btf) { return (unsigned long) current; } const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) { return (unsigned long) task_pt_regs(task); } BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs) const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, .gpl_only = true, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .ret_type = RET_PTR_TO_BTF_ID, .ret_btf_id = &bpf_task_pt_regs_ids[0], }; struct send_signal_irq_work { struct irq_work irq_work; struct task_struct *task; u32 sig; enum pid_type type; bool has_siginfo; struct kernel_siginfo info; }; static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); static void do_bpf_send_signal(struct irq_work *entry) { struct send_signal_irq_work *work; struct kernel_siginfo *siginfo; work = container_of(entry, struct send_signal_irq_work, irq_work); siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV; group_send_sig_info(work->sig, siginfo, work->task, work->type); put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value) { struct send_signal_irq_work *work = NULL; struct kernel_siginfo info; struct kernel_siginfo *siginfo; if (!task) { task = current; siginfo = SEND_SIG_PRIV; } else { clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; info.si_value.sival_ptr = (void *)(unsigned long)value; siginfo = &info; } /* Similar to bpf_probe_write_user, task needs to be * in a sound condition and kernel memory access be * permitted in order to send signal to the current * task. */ if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; /* Task should not be pid=1 to avoid kernel panic. */ if (unlikely(is_global_init(task))) return -EPERM; if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ if (unlikely(!valid_signal(sig))) return -EINVAL; work = this_cpu_ptr(&send_signal_work); if (irq_work_is_busy(&work->irq_work)) return -EBUSY; /* Add the current task, which is the target of sending signal, * to the irq_work. The current task may change when queued * irq works get executed. */ work->task = get_task_struct(task); work->has_siginfo = siginfo == &info; if (work->has_siginfo) copy_siginfo(&work->info, &info); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); return 0; } return group_send_sig_info(sig, siginfo, task, type); } BPF_CALL_1(bpf_send_signal, u32, sig) { return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_send_signal_thread, u32, sig) { return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_d_path, const struct path *, path, char *, buf, u32, sz) { struct path copy; long len; char *p; if (!sz) return 0; /* * The path pointer is verified as trusted and safe to use, * but let's double check it's valid anyway to workaround * potentially broken verifier. */ len = copy_from_kernel_nofault(&copy, path, sizeof(*path)); if (len < 0) return len; p = d_path(&copy, buf, sz); if (IS_ERR(p)) { len = PTR_ERR(p); } else { len = buf + sz - p; memmove(buf, p, len); } return len; } BTF_SET_START(btf_allowlist_d_path) #ifdef CONFIG_SECURITY BTF_ID(func, security_file_permission) BTF_ID(func, security_inode_getattr) BTF_ID(func, security_file_open) #endif #ifdef CONFIG_SECURITY_PATH BTF_ID(func, security_path_truncate) #endif BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_ITER) return true; if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) static const struct bpf_func_proto bpf_d_path_proto = { .func = bpf_d_path, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_d_path_btf_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .allowed = bpf_d_path_allowed, }; #define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ BTF_F_PTR_RAW | BTF_F_ZERO) static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id) { const struct btf_type *t; if (unlikely(flags & ~(BTF_F_ALL))) return -EINVAL; if (btf_ptr_size != sizeof(struct btf_ptr)) return -EINVAL; *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; else return -EINVAL; if (*btf_id > 0) t = btf_type_by_id(*btf, *btf_id); if (*btf_id <= 0 || !t) return -ENOENT; return 0; } BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, u32, btf_ptr_size, u64, flags) { const struct btf *btf; s32 btf_id; int ret; ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); if (ret) return ret; return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, flags); } const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) { /* This helper call is inlined by verifier. */ return ((u64 *)ctx)[-2]; } static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .func = bpf_get_func_ip_tracing, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static inline unsigned long get_entry_ip(unsigned long fentry_ip) { #ifdef CONFIG_X86_KERNEL_IBT if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; #endif return fentry_ip; } BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { struct bpf_trace_run_ctx *run_ctx __maybe_unused; struct kprobe *kp; #ifdef CONFIG_UPROBES run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); if (run_ctx->is_uprobe) return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr; #endif kp = kprobe_running(); if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) return 0; return get_entry_ip((uintptr_t)kp->addr); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .func = bpf_get_func_ip_kprobe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) { return bpf_kprobe_multi_entry_ip(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { .func = bpf_get_func_ip_kprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) { return bpf_kprobe_multi_cookie(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { .func = bpf_get_attach_cookie_kprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs) { return bpf_uprobe_multi_entry_ip(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = { .func = bpf_get_func_ip_uprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs) { return bpf_uprobe_multi_cookie(current->bpf_ctx); } static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = { .func = bpf_get_attach_cookie_uprobe_multi, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { .func = bpf_get_attach_cookie_trace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) { return ctx->event->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { .func = bpf_get_attach_cookie_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); return run_ctx->bpf_cookie; } static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { .func = bpf_get_attach_cookie_tracing, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) { static const u32 br_entry_size = sizeof(struct perf_branch_entry); u32 entry_cnt = size / br_entry_size; entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt); if (unlikely(flags)) return -EINVAL; if (!entry_cnt) return -ENOENT; return entry_cnt * br_entry_size; } const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ u64 nr_args = ((u64 *)ctx)[-1]; if ((u64) n >= nr_args) return -EINVAL; *value = ((u64 *)ctx)[n]; return 0; } static const struct bpf_func_proto bpf_get_func_arg_proto = { .func = get_func_arg, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg3_size = sizeof(u64), }; BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ u64 nr_args = ((u64 *)ctx)[-1]; *value = ((u64 *)ctx)[nr_args]; return 0; } static const struct bpf_func_proto bpf_get_func_ret_proto = { .func = get_func_ret, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg2_size = sizeof(u64), }; BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. */ return ((u64 *)ctx)[-1]; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .func = get_func_arg_cnt, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func_proto; switch (func_id) { case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; default: break; } func_proto = bpf_base_func_proto(func_id, prog); if (func_proto) return func_proto; if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN)) return NULL; switch (func_id) { case BPF_FUNC_probe_write_user: return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? NULL : &bpf_probe_write_user_proto; default: return NULL; } } static bool is_kprobe_multi(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI || prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_kprobe_session(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI || prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static inline bool is_uprobe_session(const struct bpf_prog *prog) { return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; } static const struct bpf_func_proto * kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: if (is_kprobe_multi(prog)) return &bpf_get_func_ip_proto_kprobe_multi; if (is_uprobe_multi(prog)) return &bpf_get_func_ip_proto_uprobe_multi; return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: if (is_kprobe_multi(prog)) return &bpf_get_attach_cookie_proto_kmulti; if (is_uprobe_multi(prog)) return &bpf_get_attach_cookie_proto_umulti; return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; if (off % size != 0) return false; /* * Assertion for 32 bit to make sure last 8 byte access * (BPF_DW) to the last 4 byte member is disallowed. */ if (off + size > sizeof(struct pt_regs)) return false; if (type == BPF_WRITE) prog->aux->kprobe_write_ctx = true; return true; } const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; const struct bpf_prog_ops kprobe_prog_ops = { }; BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; /* * r1 points to perf tracepoint buffer where first 8 bytes are hidden * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it * from there and call the same bpf_perf_event_output() helper inline. */ return ____bpf_perf_event_output(regs, map, flags, data, size); } static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .func = bpf_perf_event_output_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, u64, flags) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; /* * Same comment as in bpf_perf_event_output_tp(), only that this time * the other helper's function body cannot be inlined due to being * external, thus we need to call raw helper function. */ return bpf_get_stackid((unsigned long) regs, (unsigned long) map, flags, 0, 0); } static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .func = bpf_get_stackid_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = *(struct pt_regs **)tp_buff; return bpf_get_stack((unsigned long) regs, (unsigned long) buf, (unsigned long) size, flags, 0); } static const struct bpf_func_proto bpf_get_stack_proto_tp = { .func = bpf_get_stack_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); return true; } const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; const struct bpf_prog_ops tracepoint_prog_ops = { }; BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; if (unlikely(size != sizeof(struct bpf_perf_event_value))) goto clear; err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, &buf->running); if (unlikely(err)) goto clear; return 0; clear: memset(buf, 0, size); return err; } static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { static const u32 br_entry_size = sizeof(struct perf_branch_entry); struct perf_branch_stack *br_stack = ctx->data->br_stack; u32 to_copy; if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) return -ENOENT; if (unlikely(!br_stack)) return -ENOENT; if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) return br_stack->nr * br_entry_size; if (!buf || (size % br_entry_size != 0)) return -EINVAL; to_copy = min_t(u32, br_stack->nr * br_entry_size, size); memcpy(buf, br_stack->entries, to_copy); return to_copy; } static const struct bpf_func_proto bpf_read_branch_records_proto = { .func = bpf_read_branch_records, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM_OR_NULL, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_pe; default: return bpf_tracing_func_proto(func_id, prog); } } /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. * * Since raw tracepoints run despite bpf_prog_active, support concurrent usage * in normal, irq, and nmi context. */ struct bpf_raw_tp_regs { struct pt_regs regs[3]; }; static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); static struct pt_regs *get_bpf_raw_tp_regs(void) { struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } return &tp_regs->regs[nest_level - 1]; } static void put_bpf_raw_tp_regs(void) { this_cpu_dec(bpf_raw_tp_nest_level); } BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); ret = ____bpf_perf_event_output(regs, map, flags, data, size); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .func = bpf_perf_event_output_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; extern const struct bpf_func_proto bpf_skb_output_proto; extern const struct bpf_func_proto bpf_xdp_output_proto; extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, flags, 0, 0); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .func = bpf_get_stackid_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = get_bpf_raw_tp_regs(); int ret; if (IS_ERR(regs)) return PTR_ERR(regs); perf_fetch_caller_regs(regs); ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, (unsigned long) size, flags, 0); put_bpf_raw_tp_regs(); return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .func = bpf_get_stack_raw_tp, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; case BPF_FUNC_get_attach_cookie: return &bpf_get_attach_cookie_proto_tracing; default: return bpf_tracing_func_proto(func_id, prog); } } const struct bpf_func_proto * tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *fn; switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_skb_output: return &bpf_skb_output_proto; case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; case BPF_FUNC_skc_to_tcp6_sock: return &bpf_skc_to_tcp6_sock_proto; case BPF_FUNC_skc_to_tcp_sock: return &bpf_skc_to_tcp_sock_proto; case BPF_FUNC_skc_to_tcp_timewait_sock: return &bpf_skc_to_tcp_timewait_sock_proto; case BPF_FUNC_skc_to_tcp_request_sock: return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; case BPF_FUNC_skc_to_unix_sock: return &bpf_skc_to_unix_sock_proto; case BPF_FUNC_skc_to_mptcp_sock: return &bpf_skc_to_mptcp_sock_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; case BPF_FUNC_xdp_get_buff_len: return &bpf_xdp_get_buff_len_trace_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_printf_proto : NULL; case BPF_FUNC_seq_write: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; case BPF_FUNC_seq_printf_btf: return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_printf_btf_proto : NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) return &bpf_get_attach_cookie_proto_tracing; return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) fn = bpf_iter_get_func_proto(func_id, prog); return fn; } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { return bpf_tracing_ctx_access(off, size, type); } static bool tracing_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { return bpf_tracing_btf_ctx_access(off, size, type, prog, info); } int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { return -ENOTSUPP; } const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_prog_is_valid_access, }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { #ifdef CONFIG_NET .test_run = bpf_prog_test_run_raw_tp, #endif }; const struct bpf_verifier_ops tracing_verifier_ops = { .get_func_proto = tracing_prog_func_proto, .is_valid_access = tracing_prog_is_valid_access, }; const struct bpf_prog_ops tracing_prog_ops = { .test_run = bpf_prog_test_run_tracing, }; static bool raw_tp_writable_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off == 0) { if (size != sizeof(u64) || type != BPF_READ) return false; info->reg_type = PTR_TO_TP_BUFFER; } return raw_tp_prog_is_valid_access(off, size, type, prog, info); } const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { .get_func_proto = raw_tp_prog_func_proto, .is_valid_access = raw_tp_writable_prog_is_valid_access, }; const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) { if (sizeof(unsigned long) != 4) return false; if (size != 8) return false; if (off % size != 4) return false; } switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): bpf_ctx_record_field_size(info, size_u64); if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; case bpf_ctx_range(struct bpf_perf_event_data, addr): bpf_ctx_record_field_size(info, size_u64); if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: if (size != sizeof(long)) return false; } return true; } static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; case offsetof(struct bpf_perf_event_data, addr): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct perf_sample_data, addr, 8, target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, regs)); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, si->off); break; } return insn - insn_buf; } const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; const struct bpf_prog_ops perf_event_prog_ops = { }; static DEFINE_MUTEX(bpf_event_mutex); #define BPF_TRACE_MAX_PROGS 64 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; /* * Kprobe override only works if they are on the function entry, * and only if they are on the opt-in list. */ if (prog->kprobe_override && (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; mutex_lock(&bpf_event_mutex); if (event->prog) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; goto unlock; } ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); if (ret < 0) goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; event->bpf_cookie = bpf_cookie; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free_sleepable(old_array); unlock: mutex_unlock(&bpf_event_mutex); return ret; } void perf_event_detach_bpf_prog(struct perf_event *event) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; struct bpf_prog *prog = NULL; int ret; mutex_lock(&bpf_event_mutex); if (!event->prog) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (!old_array) goto put; ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free_sleepable(old_array); } put: prog = event->prog; event->prog = NULL; unlock: mutex_unlock(&bpf_event_mutex); if (prog) { /* * It could be that the bpf_prog is not sleepable (and will be freed * via normal RCU), but is called from a point that supports sleepable * programs and uses tasks-trace-RCU. */ synchronize_rcu_tasks_trace(); bpf_prog_put(prog); } } int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; ids_len = query.ids_len; if (ids_len > BPF_TRACE_MAX_PROGS) return -E2BIG; ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; /* * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which * is required when user only wants to check for uquery->prog_cnt. * There is no need to check for it since the case is handled * gracefully in bpf_prog_array_copy_info. */ mutex_lock(&bpf_event_mutex); progs = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) ret = -EFAULT; kfree(ids); return ret; } extern struct bpf_raw_event_map __start__bpf_raw_tp[]; extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { struct bpf_raw_event_map *btp = __start__bpf_raw_tp; for (; btp < __stop__bpf_raw_tp; btp++) { if (!strcmp(btp->tp->name, name)) return btp; } return bpf_get_raw_tracepoint_module(name); } void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); } static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { struct bpf_prog *prog = link->link.prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); goto out; } run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); rcu_read_lock(); (void) bpf_prog_run(prog, args); rcu_read_unlock(); bpf_reset_run_ctx(old_run_ctx); out: this_cpu_dec(*(prog->active)); } #define UNPACK(...) __VA_ARGS__ #define REPEAT_1(FN, DL, X, ...) FN(X) #define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) #define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) #define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) #define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) #define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) #define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) #define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) #define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) #define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) #define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) #define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) #define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__) #define SARG(X) u64 arg##X #define COPY(X) args[X] = arg##X #define __DL_COM (,) #define __DL_SEM (;) #define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 #define BPF_TRACE_DEFN_x(x) \ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \ REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ { \ u64 args[x]; \ REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ __bpf_trace_run(link, args); \ } \ EXPORT_SYMBOL_GPL(bpf_trace_run##x) BPF_TRACE_DEFN_x(1); BPF_TRACE_DEFN_x(2); BPF_TRACE_DEFN_x(3); BPF_TRACE_DEFN_x(4); BPF_TRACE_DEFN_x(5); BPF_TRACE_DEFN_x(6); BPF_TRACE_DEFN_x(7); BPF_TRACE_DEFN_x(8); BPF_TRACE_DEFN_x(9); BPF_TRACE_DEFN_x(10); BPF_TRACE_DEFN_x(11); BPF_TRACE_DEFN_x(12); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { struct tracepoint *tp = btp->tp; struct bpf_prog *prog = link->link.prog; /* * check that program doesn't access arguments beyond what's * available in this tracepoint */ if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; if (prog->aux->max_tp_access > btp->writable_size) return -EINVAL; return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link); } int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link) { return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link); } int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr, unsigned long *missed) { bool is_tracepoint, is_syscall_tp; struct bpf_prog *prog; int flags, err = 0; prog = event->prog; if (!prog) return -ENOENT; /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) return -EOPNOTSUPP; *prog_id = prog->aux->id; flags = event->tp_event->flags; is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); if (is_tracepoint || is_syscall_tp) { *buf = is_tracepoint ? event->tp_event->tp->name : event->tp_event->name; /* We allow NULL pointer for tracepoint */ if (fd_type) *fd_type = BPF_FD_TYPE_TRACEPOINT; if (probe_offset) *probe_offset = 0x0; if (probe_addr) *probe_addr = 0x0; } else { /* kprobe/uprobe */ err = -EOPNOTSUPP; #ifdef CONFIG_KPROBE_EVENTS if (flags & TRACE_EVENT_FL_KPROBE) err = bpf_get_kprobe_info(event, fd_type, buf, probe_offset, probe_addr, missed, event->attr.type == PERF_TYPE_TRACEPOINT); #endif #ifdef CONFIG_UPROBE_EVENTS if (flags & TRACE_EVENT_FL_UPROBE) err = bpf_get_uprobe_info(event, fd_type, buf, probe_offset, probe_addr, event->attr.type == PERF_TYPE_TRACEPOINT); #endif } return err; } static int __init send_signal_irq_work_init(void) { int cpu; struct send_signal_irq_work *work; for_each_possible_cpu(cpu) { work = per_cpu_ptr(&send_signal_work, cpu); init_irq_work(&work->irq_work, do_bpf_send_signal); } return 0; } subsys_initcall(send_signal_irq_work_init); #ifdef CONFIG_MODULES static int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; int ret = 0; if (mod->num_bpf_raw_events == 0 || (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) goto out; mutex_lock(&bpf_module_mutex); switch (op) { case MODULE_STATE_COMING: btm = kzalloc(sizeof(*btm), GFP_KERNEL); if (btm) { btm->module = module; list_add(&btm->list, &bpf_trace_modules); } else { ret = -ENOMEM; } break; case MODULE_STATE_GOING: list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { if (btm->module == module) { list_del(&btm->list); kfree(btm); break; } } break; } mutex_unlock(&bpf_module_mutex); out: return notifier_from_errno(ret); } static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; } fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ struct bpf_session_run_ctx { struct bpf_run_ctx run_ctx; bool is_return; void *data; }; #ifdef CONFIG_FPROBE struct bpf_kprobe_multi_link { struct bpf_link link; struct fprobe fp; unsigned long *addrs; u64 *cookies; u32 cnt; u32 mods_cnt; struct module **mods; }; struct bpf_kprobe_multi_run_ctx { struct bpf_session_run_ctx session_ctx; struct bpf_kprobe_multi_link *link; unsigned long entry_ip; }; struct user_syms { const char **syms; char *buf; }; #ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); #define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) #else #define bpf_kprobe_multi_pt_regs_ptr() (NULL) #endif static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) { unsigned long ip = ftrace_get_symaddr(fentry_ip); return ip ? : fentry_ip; } static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; const char **syms = NULL; char *buf = NULL, *p; int err = -ENOMEM; unsigned int i; syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); if (!syms) goto error; buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); if (!buf) goto error; for (p = buf, i = 0; i < cnt; i++) { if (__get_user(usymbol, usyms + i)) { err = -EFAULT; goto error; } err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); if (err == KSYM_NAME_LEN) err = -E2BIG; if (err < 0) goto error; syms[i] = p; p += err + 1; } us->syms = syms; us->buf = buf; return 0; error: if (err) { kvfree(syms); kvfree(buf); } return err; } static void kprobe_multi_put_modules(struct module **mods, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) module_put(mods[i]); } static void free_user_syms(struct user_syms *us) { kvfree(us->syms); kvfree(us->buf); } static void bpf_kprobe_multi_link_release(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); unregister_fprobe(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); kvfree(kmulti_link->addrs); kvfree(kmulti_link->cookies); kfree(kmulti_link->mods); kfree(kmulti_link); } static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { u64 __user *ucookies = u64_to_user_ptr(info->kprobe_multi.cookies); u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); struct bpf_kprobe_multi_link *kmulti_link; u32 ucount = info->kprobe_multi.count; int err = 0, i; if (!uaddrs ^ !ucount) return -EINVAL; if (ucookies && !ucount) return -EINVAL; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) return 0; if (ucount < kmulti_link->cnt) err = -ENOSPC; else ucount = kmulti_link->cnt; if (ucookies) { if (kmulti_link->cookies) { if (copy_to_user(ucookies, kmulti_link->cookies, ucount * sizeof(u64))) return -EFAULT; } else { for (i = 0; i < ucount; i++) { if (put_user(0, ucookies + i)) return -EFAULT; } } } if (kallsyms_show_value(current_cred())) { if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) return -EFAULT; } else { for (i = 0; i < ucount; i++) { if (put_user(0, uaddrs + i)) return -EFAULT; } } return err; } #ifdef CONFIG_PROC_FS static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); seq_printf(seq, "kprobe_cnt:\t%u\n" "missed:\t%lu\n", kmulti_link->cnt, kmulti_link->fp.nmissed); seq_printf(seq, "%s\t %s\n", "cookie", "func"); for (int i = 0; i < kmulti_link->cnt; i++) { seq_printf(seq, "%llu\t %pS\n", kmulti_link->cookies[i], (void *)kmulti_link->addrs[i]); } } #endif static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_kprobe_multi_show_fdinfo, #endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) { const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ swap(*addr_a, *addr_b); swap(*cookie_a, *cookie_b); } static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b) { const unsigned long *addr_a = a, *addr_b = b; if (*addr_a == *addr_b) return 0; return *addr_a < *addr_b ? -1 : 1; } static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) { return bpf_kprobe_multi_addrs_cmp(a, b); } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; struct bpf_kprobe_multi_link *link; u64 *cookie, entry_ip; unsigned long *addr; if (WARN_ON_ONCE(!ctx)) return 0; run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, session_ctx.run_ctx); link = run_ctx->link; if (!link->cookies) return 0; entry_ip = run_ctx->entry_ip; addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), bpf_kprobe_multi_addrs_cmp); if (!addr) return 0; cookie = link->cookies + (addr - link->addrs); return *cookie; } static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_kprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->entry_ip; } static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, .data = data, }, .link = link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; struct pt_regs *regs; int err; /* * graph tracer framework ensures we won't migrate, so there is no need * to use migrate_disable for bpf_prog_run again. The check here just for * __this_cpu_inc_return. */ cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); err = 1; goto out; } rcu_read_lock(); regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); return err; } static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), fregs, false, data); return is_kprobe_session(link->link.prog) ? err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) { const char **str_a = (const char **) a; const char **str_b = (const char **) b; return strcmp(*str_a, *str_b); } struct multi_symbols_sort { const char **funcs; u64 *cookies; }; static void symbols_swap_r(void *a, void *b, int size, const void *priv) { const struct multi_symbols_sort *data = priv; const char **name_a = a, **name_b = b; swap(*name_a, *name_b); /* If defined, swap also related cookies. */ if (data->cookies) { u64 *cookie_a, *cookie_b; cookie_a = data->cookies + (name_a - data->funcs); cookie_b = data->cookies + (name_b - data->funcs); swap(*cookie_a, *cookie_b); } } struct modules_array { struct module **mods; int mods_cnt; int mods_cap; }; static int add_module(struct modules_array *arr, struct module *mod) { struct module **mods; if (arr->mods_cnt == arr->mods_cap) { arr->mods_cap = max(16, arr->mods_cap * 3 / 2); mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL); if (!mods) return -ENOMEM; arr->mods = mods; } arr->mods[arr->mods_cnt] = mod; arr->mods_cnt++; return 0; } static bool has_module(struct modules_array *arr, struct module *mod) { int i; for (i = arr->mods_cnt - 1; i >= 0; i--) { if (arr->mods[i] == mod) return true; } return false; } static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt) { struct modules_array arr = {}; u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { bool skip_add = false; struct module *mod; scoped_guard(rcu) { mod = __module_address(addrs[i]); /* Either no module or it's already stored */ if (!mod || has_module(&arr, mod)) { skip_add = true; break; /* scoped_guard */ } if (!try_module_get(mod)) err = -EINVAL; } if (skip_add) continue; if (err) break; err = add_module(&arr, mod); if (err) { module_put(mod); break; } } /* We return either err < 0 in case of error, ... */ if (err) { kprobe_multi_put_modules(arr.mods, arr.mods_cnt); kfree(arr.mods); return err; } /* or number of modules found if everything is ok. */ *mods = arr.mods; return arr.mods_cnt; } static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) { if (!within_error_injection_list(addrs[i])) return -EINVAL; } return 0; } int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; struct bpf_link_primer link_primer; void __user *ucookies; unsigned long *addrs; u32 flags, cnt, size; void __user *uaddrs; u64 *cookies = NULL; void __user *usyms; int err; /* no support for 32bit archs yet */ if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; if (attr->link_create.flags) return -EINVAL; if (!is_kprobe_multi(prog)) return -EINVAL; /* Writing to context is not allowed for kprobes. */ if (prog->aux->kprobe_write_ctx) return -EINVAL; flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); if (!!uaddrs == !!usyms) return -EINVAL; cnt = attr->link_create.kprobe_multi.cnt; if (!cnt) return -EINVAL; if (cnt > MAX_KPROBE_MULTI_CNT) return -E2BIG; size = cnt * sizeof(*addrs); addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); if (ucookies) { cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); if (!cookies) { err = -ENOMEM; goto error; } if (copy_from_user(cookies, ucookies, size)) { err = -EFAULT; goto error; } } if (uaddrs) { if (copy_from_user(addrs, uaddrs, size)) { err = -EFAULT; goto error; } } else { struct multi_symbols_sort data = { .cookies = cookies, }; struct user_syms us; err = copy_user_syms(&us, usyms, cnt); if (err) goto error; if (cookies) data.funcs = us.syms; sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, symbols_swap_r, &data); err = ftrace_lookup_symbols(us.syms, cnt, addrs); free_user_syms(&us); if (err) goto error; } if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { err = -EINVAL; goto error; } link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; goto error; } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) goto error; if (!(flags & BPF_F_KPROBE_MULTI_RETURN)) link->fp.entry_handler = kprobe_multi_link_handler; if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog)) link->fp.exit_handler = kprobe_multi_link_exit_handler; if (is_kprobe_session(prog)) link->fp.entry_data_size = sizeof(u64); link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; link->link.flags = flags; if (cookies) { /* * Sorting addresses will trigger sorting cookies as well * (check bpf_kprobe_multi_cookie_swap). This way we can * find cookie based on the address in bpf_get_attach_cookie * helper. */ sort_r(addrs, cnt, sizeof(*addrs), bpf_kprobe_multi_cookie_cmp, bpf_kprobe_multi_cookie_swap, link); } err = get_modules_for_addrs(&link->mods, addrs, cnt); if (err < 0) { bpf_link_cleanup(&link_primer); return err; } link->mods_cnt = err; err = register_fprobe_ips(&link->fp, addrs, cnt); if (err) { kprobe_multi_put_modules(link->mods, link->mods_cnt); bpf_link_cleanup(&link_primer); return err; } return bpf_link_settle(&link_primer); error: kfree(link); kvfree(addrs); kvfree(cookies); return err; } #else /* !CONFIG_FPROBE */ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) { return 0; } static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { return 0; } #endif #ifdef CONFIG_UPROBES struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; unsigned long ref_ctr_offset; u64 cookie; struct uprobe *uprobe; struct uprobe_consumer consumer; bool session; }; struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; struct bpf_uprobe *uprobes; struct task_struct *task; }; struct bpf_uprobe_multi_run_ctx { struct bpf_session_run_ctx session_ctx; unsigned long entry_ip; struct bpf_uprobe *uprobe; }; static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; for (i = 0; i < cnt; i++) uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); if (cnt) uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) { struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) { struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); kvfree(umulti_link->uprobes); kfree(umulti_link); } static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); u32 upath_size = info->uprobe_multi.path_size; struct bpf_uprobe_multi_link *umulti_link; u32 ucount = info->uprobe_multi.count; int err = 0, i; char *p, *buf; long left = 0; if (!upath ^ !upath_size) return -EINVAL; if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) return -EINVAL; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; upath_size = upath_size ? min_t(u32, upath_size, PATH_MAX) : PATH_MAX; buf = kmalloc(upath_size, GFP_KERNEL); if (!buf) return -ENOMEM; p = d_path(&umulti_link->path, buf, upath_size); if (IS_ERR(p)) { kfree(buf); return PTR_ERR(p); } upath_size = buf + upath_size - p; if (upath) left = copy_to_user(upath, p, upath_size); kfree(buf); if (left) return -EFAULT; info->uprobe_multi.path_size = upath_size; if (!uoffsets && !ucookies && !uref_ctr_offsets) return 0; if (ucount < umulti_link->cnt) err = -ENOSPC; else ucount = umulti_link->cnt; for (i = 0; i < ucount; i++) { if (uoffsets && put_user(umulti_link->uprobes[i].offset, uoffsets + i)) return -EFAULT; if (uref_ctr_offsets && put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) return -EFAULT; if (ucookies && put_user(umulti_link->uprobes[i].cookie, ucookies + i)) return -EFAULT; } return err; } #ifdef CONFIG_PROC_FS static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_uprobe_multi_link *umulti_link; char *p, *buf; pid_t pid; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return; p = d_path(&umulti_link->path, buf, PATH_MAX); if (IS_ERR(p)) { kfree(buf); return; } pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; seq_printf(seq, "uprobe_cnt:\t%u\n" "pid:\t%u\n" "path:\t%s\n", umulti_link->cnt, pid, p); seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); for (int i = 0; i < umulti_link->cnt; i++) { seq_printf(seq, "%llu\t %#llx\t %#lx\n", umulti_link->uprobes[i].cookie, umulti_link->uprobes[i].offset, umulti_link->uprobes[i].ref_ctr_offset); } kfree(buf); } #endif static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_uprobe_multi_show_fdinfo, #endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, unsigned long entry_ip, struct pt_regs *regs, bool is_return, void *data) { struct bpf_uprobe_multi_link *link = uprobe->link; struct bpf_uprobe_multi_run_ctx run_ctx = { .session_ctx = { .is_return = is_return, .data = data, }, .entry_ip = entry_ip, .uprobe = uprobe, }; struct bpf_prog *prog = link->link.prog; bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; int err; if (link->task && !same_thread_group(current, link->task)) return 0; if (sleepable) rcu_read_lock_trace(); else rcu_read_lock(); migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); if (sleepable) rcu_read_unlock_trace(); else rcu_read_unlock(); return err; } static bool uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); return uprobe->link->task->mm == mm; } static int uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; int ret; uprobe = container_of(con, struct bpf_uprobe, consumer); ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data); if (uprobe->session) return ret ? UPROBE_HANDLER_IGNORE : 0; return 0; } static int uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, __u64 *data) { struct bpf_uprobe *uprobe; uprobe = container_of(con, struct bpf_uprobe, consumer); uprobe_prog_run(uprobe, func, regs, true, data); return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->entry_ip; } static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { struct bpf_uprobe_multi_run_ctx *run_ctx; run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, session_ctx.run_ctx); return run_ctx->uprobe->cookie; } int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; void __user *upath; u32 flags, cnt, i; struct path path; char *name; pid_t pid; int err; /* no support for 32bit archs yet */ if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; if (attr->link_create.flags) return -EINVAL; if (!is_uprobe_multi(prog)) return -EINVAL; flags = attr->link_create.uprobe_multi.flags; if (flags & ~BPF_F_UPROBE_MULTI_RETURN) return -EINVAL; /* * path, offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; if (!upath || !uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); name = strndup_user(upath, PATH_MAX); if (IS_ERR(name)) { err = PTR_ERR(name); return err; } err = kern_path(name, LOOKUP_FOLLOW, &path); kfree(name); if (err) return err; if (!d_is_reg(path.dentry)) { err = -EBADF; goto error_path_put; } if (pid) { rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; } } err = -ENOMEM; link = kzalloc(sizeof(*link), GFP_KERNEL); uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL); if (!uprobes || !link) goto error_free; for (i = 0; i < cnt; i++) { if (__get_user(uprobes[i].offset, uoffsets + i)) { err = -EFAULT; goto error_free; } if (uprobes[i].offset < 0) { err = -EINVAL; goto error_free; } if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } uprobes[i].link = link; if (!(flags & BPF_F_UPROBE_MULTI_RETURN)) uprobes[i].consumer.handler = uprobe_multi_link_handler; if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog)) uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; if (is_uprobe_session(prog)) uprobes[i].session = true; if (pid) uprobes[i].consumer.filter = uprobe_multi_link_filter; } link->cnt = cnt; link->uprobes = uprobes; link->path = path; link->task = task; link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), uprobes[i].offset, uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (IS_ERR(uprobes[i].uprobe)) { err = PTR_ERR(uprobes[i].uprobe); link->cnt = i; goto error_unregister; } } err = bpf_link_prime(&link->link, &link_primer); if (err) goto error_unregister; return bpf_link_settle(&link_primer); error_unregister: bpf_uprobe_unregister(uprobes, link->cnt); error_free: kvfree(uprobes); kfree(link); if (task) put_task_struct(task); error_path_put: path_put(&path); return err; } #else /* !CONFIG_UPROBES */ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) { return 0; } static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) { return 0; } #endif /* CONFIG_UPROBES */ __bpf_kfunc_start_defs(); __bpf_kfunc bool bpf_session_is_return(void) { struct bpf_session_run_ctx *session_ctx; session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); return session_ctx->is_return; } __bpf_kfunc __u64 *bpf_session_cookie(void) { struct bpf_session_run_ctx *session_ctx; session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx); return session_ctx->data; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) { if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) return 0; if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) return -EACCES; return 0; } static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { .owner = THIS_MODULE, .set = &kprobe_multi_kfunc_set_ids, .filter = bpf_kprobe_multi_filter, }; static int __init bpf_kprobe_multi_kfuncs_init(void) { return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); } late_initcall(bpf_kprobe_multi_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); /* * The __always_inline is to make sure the compiler doesn't * generate indirect calls into callbacks, which is expensive, * on some kernel configurations. This allows compiler to put * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; u32 chunk_sz, off; void *dst_slice; int cnt, err; char buf[256]; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); if (likely(dst_slice)) return str_copy_fn(dst_slice, unsafe_src, size, tsk); dst = (struct bpf_dynptr_kern *)dptr; if (bpf_dynptr_check_off_len(dst, doff, size)) return -E2BIG; for (off = 0; off < size; off += chunk_sz - 1) { chunk_sz = min_t(u32, sizeof(buf), size - off); /* Expect str_copy_fn to return count of copied bytes, including * zero terminator. Next iteration increment off by chunk_sz - 1 to * overwrite NUL. */ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (cnt < 0) return cnt; err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); if (err) return err; if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ return off + cnt; } return off; } static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, u32 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; u32 off, chunk_sz; int err; dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); if (likely(dst_slice)) return copy_fn(dst_slice, unsafe_src, size, tsk); dst = (struct bpf_dynptr_kern *)dptr; if (bpf_dynptr_check_off_len(dst, doff, size)) return -E2BIG; for (off = 0; off < size; off += chunk_sz) { chunk_sz = min_t(u32, sizeof(buf), size - off); err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); if (err) return err; err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); if (err) return err; } return 0; } static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); } static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { int ret; if (!tsk) { /* Read from the current task */ ret = copy_from_user(dst, (const void __user *)unsafe_src, size); if (ret) return -EFAULT; return 0; } ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); if (ret != size) return -EFAULT; return 0; } static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return copy_from_kernel_nofault(dst, unsafe_src, size); } static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); } static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { int ret; if (unlikely(size == 0)) return 0; if (tsk) { ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); } else { ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); /* strncpy_from_user does not guarantee NUL termination */ if (ret >= 0) ((char *)dst)[ret] = '\0'; } if (ret < 0) return ret; return ret + 1; } static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, u32 size, struct task_struct *tsk) { return strncpy_from_kernel_nofault(dst, unsafe_src, size); } __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, u64 value) { if (type != PIDTYPE_PID && type != PIDTYPE_TGID) return -EINVAL; return bpf_send_signal_common(sig, type, task, value); } __bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); } __bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } __bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, u32 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, copy_user_str_sleepable, tsk); } __bpf_kfunc_end_defs();
2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 // SPDX-License-Identifier: GPL-2.0-only /* * stack_o2cb.c * * Code which interfaces ocfs2 with the o2cb stack. * * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/module.h> /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ #include <linux/fs.h> #include "cluster/masklog.h" #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" #include "cluster/tcp.h" #include "stackglue.h" struct o2dlm_private { struct dlm_eviction_cb op_eviction_cb; }; static struct ocfs2_stack_plugin o2cb_stack; /* These should be identical */ #if (DLM_LOCK_IV != LKM_IVMODE) # error Lock modes do not match #endif #if (DLM_LOCK_NL != LKM_NLMODE) # error Lock modes do not match #endif #if (DLM_LOCK_CR != LKM_CRMODE) # error Lock modes do not match #endif #if (DLM_LOCK_CW != LKM_CWMODE) # error Lock modes do not match #endif #if (DLM_LOCK_PR != LKM_PRMODE) # error Lock modes do not match #endif #if (DLM_LOCK_PW != LKM_PWMODE) # error Lock modes do not match #endif #if (DLM_LOCK_EX != LKM_EXMODE) # error Lock modes do not match #endif static inline int mode_to_o2dlm(int mode) { BUG_ON(mode > LKM_MAXMODE); return mode; } static int flags_to_o2dlm(u32 flags) { int o2dlm_flags = 0; if (flags & DLM_LKF_NOQUEUE) o2dlm_flags |= LKM_NOQUEUE; if (flags & DLM_LKF_CANCEL) o2dlm_flags |= LKM_CANCEL; if (flags & DLM_LKF_CONVERT) o2dlm_flags |= LKM_CONVERT; if (flags & DLM_LKF_VALBLK) o2dlm_flags |= LKM_VALBLK; if (flags & DLM_LKF_IVVALBLK) o2dlm_flags |= LKM_INVVALBLK; if (flags & DLM_LKF_ORPHAN) o2dlm_flags |= LKM_ORPHAN; if (flags & DLM_LKF_FORCEUNLOCK) o2dlm_flags |= LKM_FORCE; if (flags & DLM_LKF_TIMEOUT) o2dlm_flags |= LKM_TIMEOUT; if (flags & DLM_LKF_LOCAL) o2dlm_flags |= LKM_LOCAL; return o2dlm_flags; } /* * Map an o2dlm status to standard errno values. * * o2dlm only uses a handful of these, and returns even fewer to the * caller. Still, we try to assign sane values to each error. * * The following value pairs have special meanings to dlmglue, thus * the right hand side needs to stay unique - never duplicate the * mapping elsewhere in the table! * * DLM_NORMAL: 0 * DLM_NOTQUEUED: -EAGAIN * DLM_CANCELGRANT: -EBUSY * DLM_CANCEL: -DLM_ECANCEL */ /* Keep in sync with dlmapi.h */ static int status_map[] = { [DLM_NORMAL] = 0, /* Success */ [DLM_GRANTED] = -EINVAL, [DLM_DENIED] = -EACCES, [DLM_DENIED_NOLOCKS] = -EACCES, [DLM_WORKING] = -EACCES, [DLM_BLOCKED] = -EINVAL, [DLM_BLOCKED_ORPHAN] = -EINVAL, [DLM_DENIED_GRACE_PERIOD] = -EACCES, [DLM_SYSERR] = -ENOMEM, /* It is what it is */ [DLM_NOSUPPORT] = -EPROTO, [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ [DLM_IVLOCKID] = -EINVAL, [DLM_SYNC] = -EINVAL, [DLM_BADTYPE] = -EINVAL, [DLM_BADRESOURCE] = -EINVAL, [DLM_MAXHANDLES] = -ENOMEM, [DLM_NOCLINFO] = -EINVAL, [DLM_NOLOCKMGR] = -EINVAL, [DLM_NOPURGED] = -EINVAL, [DLM_BADARGS] = -EINVAL, [DLM_VOID] = -EINVAL, [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ [DLM_IVBUFLEN] = -EINVAL, [DLM_CVTUNGRANT] = -EPERM, [DLM_BADPARAM] = -EINVAL, [DLM_VALNOTVALID] = -EINVAL, [DLM_REJECTED] = -EPERM, [DLM_ABORT] = -EINVAL, [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ [DLM_IVRESHANDLE] = -EINVAL, [DLM_DEADLOCK] = -EDEADLK, [DLM_DENIED_NOASTS] = -EINVAL, [DLM_FORWARD] = -EINVAL, [DLM_TIMEOUT] = -ETIMEDOUT, [DLM_IVGROUPID] = -EINVAL, [DLM_VERS_CONFLICT] = -EOPNOTSUPP, [DLM_BAD_DEVICE_PATH] = -ENOENT, [DLM_NO_DEVICE_PERMISSION] = -EPERM, [DLM_NO_CONTROL_DEVICE] = -ENOENT, [DLM_RECOVERING] = -ENOTCONN, [DLM_MIGRATING] = -ERESTART, [DLM_MAXSTATS] = -EINVAL, }; static int dlm_status_to_errno(enum dlm_status status) { BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } static void o2dlm_lock_ast_wrapper(void *astarg) { struct ocfs2_dlm_lksb *lksb = astarg; lksb->lksb_conn->cc_proto->lp_lock_ast(lksb); } static void o2dlm_blocking_ast_wrapper(void *astarg, int level) { struct ocfs2_dlm_lksb *lksb = astarg; lksb->lksb_conn->cc_proto->lp_blocking_ast(lksb, level); } static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) { struct ocfs2_dlm_lksb *lksb = astarg; int error = dlm_status_to_errno(status); /* * In o2dlm, you can get both the lock_ast() for the lock being * granted and the unlock_ast() for the CANCEL failing. A * successful cancel sends DLM_NORMAL here. If the * lock grant happened before the cancel arrived, you get * DLM_CANCELGRANT. * * There's no need for the double-ast. If we see DLM_CANCELGRANT, * we just ignore it. We expect the lock_ast() to handle the * granted lock. */ if (status == DLM_CANCELGRANT) return; lksb->lksb_conn->cc_proto->lp_unlock_ast(lksb, error); } static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { enum dlm_status status; int o2dlm_mode = mode_to_o2dlm(mode); int o2dlm_flags = flags_to_o2dlm(flags); int ret; status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, o2dlm_flags, name, namelen, o2dlm_lock_ast_wrapper, lksb, o2dlm_blocking_ast_wrapper); ret = dlm_status_to_errno(status); return ret; } static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { enum dlm_status status; int o2dlm_flags = flags_to_o2dlm(flags); int ret; status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, o2dlm_flags, o2dlm_unlock_ast_wrapper, lksb); ret = dlm_status_to_errno(status); return ret; } static int o2cb_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return dlm_status_to_errno(lksb->lksb_o2dlm.status); } /* * o2dlm always has a "valid" LVB. If the dlm loses track of the LVB * contents, it will zero out the LVB. Thus the caller can always trust * the contents. */ static int o2cb_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return 1; } static void *o2cb_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return (void *)(lksb->lksb_o2dlm.lvb); } static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) { dlm_print_one_lock(lksb->lksb_o2dlm.lockid); } /* * Check if this node is heartbeating and is connected to all other * heartbeating nodes. */ static int o2cb_cluster_check(void) { u8 node_num; int i; unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; node_num = o2nm_this_node(); if (node_num == O2NM_MAX_NODES) { printk(KERN_ERR "o2cb: This node has not been configured.\n"); return -EINVAL; } /* * o2dlm expects o2net sockets to be created. If not, then * dlm_join_domain() fails with a stack of errors which are both cryptic * and incomplete. The idea here is to detect upfront whether we have * managed to connect to all nodes or not. If not, then list the nodes * to allow the user to check the configuration (incorrect IP, firewall, * etc.) Yes, this is racy. But its not the end of the world. */ #define O2CB_MAP_STABILIZE_COUNT 60 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { o2hb_fill_node_map(hbmap, O2NM_MAX_NODES); if (!test_bit(node_num, hbmap)) { printk(KERN_ERR "o2cb: %s heartbeat has not been " "started.\n", (o2hb_global_heartbeat_active() ? "Global" : "Local")); return -EINVAL; } o2net_fill_node_map(netmap, O2NM_MAX_NODES); /* Force set the current node to allow easy compare */ set_bit(node_num, netmap); if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES)) return 0; if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); } printk(KERN_ERR "o2cb: This node could not connect to nodes:"); i = -1; while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { if (!test_bit(i, netmap)) printk(" %u", i); } printk(".\n"); return -ENOTCONN; } /* * Called from the dlm when it's about to evict a node. This is how the * classic stack signals node death. */ static void o2dlm_eviction_cb(int node_num, void *data) { struct ocfs2_cluster_connection *conn = data; printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", node_num, conn->cc_namelen, conn->cc_name); conn->cc_recovery_handler(node_num, conn->cc_recovery_data); } static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) { int rc = 0; u32 dlm_key; struct dlm_ctxt *dlm; struct o2dlm_private *priv; struct dlm_protocol_version fs_version; BUG_ON(conn == NULL); BUG_ON(conn->cc_proto == NULL); /* Ensure cluster stack is up and all nodes are connected */ rc = o2cb_cluster_check(); if (rc) { printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " "before retrying.\n"); goto out; } priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); if (!priv) { rc = -ENOMEM; goto out_free; } /* This just fills the structure in. It is safe to pass conn. */ dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, conn); conn->cc_private = priv; /* used by the dlm code to make message headers unique, each * node in this domain must agree on this. */ dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); fs_version.pv_major = conn->cc_version.pv_major; fs_version.pv_minor = conn->cc_version.pv_minor; dlm = dlm_register_domain(conn->cc_name, dlm_key, &fs_version); if (IS_ERR(dlm)) { rc = PTR_ERR(dlm); mlog_errno(rc); goto out_free; } conn->cc_version.pv_major = fs_version.pv_major; conn->cc_version.pv_minor = fs_version.pv_minor; conn->cc_lockspace = dlm; dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); out_free: if (rc) kfree(conn->cc_private); out: return rc; } static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) { struct dlm_ctxt *dlm = conn->cc_lockspace; struct o2dlm_private *priv = conn->cc_private; dlm_unregister_eviction_cb(&priv->op_eviction_cb); conn->cc_private = NULL; kfree(priv); dlm_unregister_domain(dlm); conn->cc_lockspace = NULL; return 0; } static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *node) { int node_num; node_num = o2nm_this_node(); if (node_num == O2NM_INVALID_NODE_NUM) return -ENOENT; if (node_num >= O2NM_MAX_NODES) return -EOVERFLOW; *node = node_num; return 0; } static const struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, .this_node = o2cb_cluster_this_node, .dlm_lock = o2cb_dlm_lock, .dlm_unlock = o2cb_dlm_unlock, .lock_status = o2cb_dlm_lock_status, .lvb_valid = o2cb_dlm_lvb_valid, .lock_lvb = o2cb_dlm_lvb, .dump_lksb = o2cb_dump_lksb, }; static struct ocfs2_stack_plugin o2cb_stack = { .sp_name = "o2cb", .sp_ops = &o2cb_stack_ops, .sp_owner = THIS_MODULE, }; static int __init o2cb_stack_init(void) { return ocfs2_stack_glue_register(&o2cb_stack); } static void __exit o2cb_stack_exit(void) { ocfs2_stack_glue_unregister(&o2cb_stack); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); MODULE_LICENSE("GPL"); module_init(o2cb_stack_init); module_exit(o2cb_stack_exit);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_VMX_HYPERV_H #define __KVM_X86_VMX_HYPERV_H #include <linux/kvm_host.h> #include "vmcs12.h" #include "vmx.h" #define EVMPTR_INVALID (-1ULL) #define EVMPTR_MAP_PENDING (-2ULL) enum nested_evmptrld_status { EVMPTRLD_DISABLED, EVMPTRLD_SUCCEEDED, EVMPTRLD_VMFAIL, EVMPTRLD_ERROR, }; #ifdef CONFIG_KVM_HYPERV static inline bool evmptr_is_valid(u64 evmptr) { return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; } static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx) { return evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); } static inline bool evmptr_is_set(u64 evmptr) { return evmptr != EVMPTR_INVALID; } static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) { return evmptr_is_set(vmx->nested.hv_evmcs_vmptr); } static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) { return vmx->nested.hv_evmcs; } static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu) { /* * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and * eVMCS has been explicitly enabled by userspace. */ return vcpu->arch.hyperv_enabled && to_vmx(vcpu)->nested.enlightened_vmcs_enabled; } u64 nested_get_evmptr(struct kvm_vcpu *vcpu); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int nested_evmcs_check_controls(struct vmcs12 *vmcs12); bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu); void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); #else static inline bool evmptr_is_valid(u64 evmptr) { return false; } static inline bool nested_vmx_is_evmptr12_valid(struct vcpu_vmx *vmx) { return false; } static inline bool evmptr_is_set(u64 evmptr) { return false; } static inline bool nested_vmx_is_evmptr12_set(struct vcpu_vmx *vmx) { return false; } static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) { return NULL; } #endif #endif /* __KVM_X86_VMX_HYPERV_H */
3 3 17 17 3 2 1 3 1 2 2 1 1 3 5 3 9 9 1 4 1 1 1 1 1 1 1 3 24 18 1 1 1 1 1 1 1 1 18 6 1 1 1 1 1 1 3 1 1 1 3 1 7 17 2 1 1 1 3 3 2 7 1 3 3 3 1 16 16 16 16 11 4 142 1 1 1 6 6 3 2 2 1 1 2 2 2 2 19 17 147 17 142 144 2 145 145 3 2 4 3 29 11 4 1 1 1 3 1 2 1 1 2 1 1 1 1 1 1 1 1 6 1 1 61 37 1 1 3 1 1 2 1 3 7 1 1 26 5 4 2 1 17 12 1 1 1 3 3 1 3 1 1 7 41 30 1 1 1 2 1 1 22 1 1 1 90 88 29 61 56 56 17 41 178 175 1 1 1 1 1 1 1 1 1 3 1 1 1 17 39 39 1 1 1 1 1 6 2 2 3 6 6 2 1 3 4 4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 // SPDX-License-Identifier: GPL-2.0-only /* * KVM Microsoft Hyper-V emulation * * derived from arch/x86/kvm/x86.c * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> #include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> #include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" #include "irq.h" #include "fpu.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) /* * As per Hyper-V TLFS, extended hypercalls start from 0x8001 * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value * where each bit tells which extended hypercall is available besides * HvExtCallQueryCapabilities. * * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit * assigned. * * 0x8002 - Bit 0 * 0x8003 - Bit 1 * .. * 0x8041 - Bit 63 * * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 */ #define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); } static inline int synic_get_sint_vector(u64 sint_value) { if (sint_value & HV_SYNIC_SINT_MASKED) return -1; return sint_value & HV_SYNIC_SINT_VECTOR_MASK; } static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, int vector) { int i; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) return true; } return false; } static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, int vector) { int i; u64 sint_value; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { sint_value = synic_read_sint(synic, i); if (synic_get_sint_vector(sint_value) == vector && sint_value & HV_SYNIC_SINT_AUTO_EOI) return true; } return false; } static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); bool auto_eoi_old, auto_eoi_new; if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; if (synic_has_vector_connected(synic, vector)) __set_bit(vector, synic->vec_bitmap); else __clear_bit(vector, synic->vec_bitmap); auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (auto_eoi_old == auto_eoi_new) return; if (!enable_apicv) return; down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; else hv->synic_auto_eoi_used--; /* * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on * the hypervisor to manually inject IRQs. */ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_HYPERV, !!hv->synic_auto_eoi_used); up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, u64 data, bool host) { int vector, old_vector; bool masked; vector = data & HV_SYNIC_SINT_VECTOR_MASK; masked = data & HV_SYNIC_SINT_MASKED; /* * Valid vectors are 16-255, however, nested Hyper-V attempts to write * default '0x10000' value on boot and this should not #GP. We need to * allow zero-initing the register from host as well. */ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked) return 1; /* * Guest may configure multiple SINTs to use the same vector, so * we maintain a bitmap of vectors handled by synic, and a * bitmap of vectors with auto-eoi behavior. The bitmaps are * updated here, and atomically queried on fast paths. */ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK; atomic64_set(&synic->sint[sint], data); synic_update_vector(synic, old_vector); synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); /* Try to deliver pending Hyper-V SynIC timers messages */ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; if (stimer->msg_pending && stimer->config.enable && !stimer->config.direct_mode && stimer->config.sintx == sint) stimer_mark_pending(stimer, false); } idx = srcu_read_lock(&kvm->irq_srcu); gsi = atomic_read(&synic->sint_to_gsi[sint]); if (gsi != -1) kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; hv_vcpu->exit.u.synic.control = synic->control; hv_vcpu->exit.u.synic.evt_page = synic->evt_page; hv_vcpu->exit.u.synic.msg_page = synic->msg_page; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: synic->control = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SVERSION: if (!host) { ret = 1; break; } synic->version = data; break; case HV_X64_MSR_SIEFP: if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->evt_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: if ((data & HV_SYNIC_SIMP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->msg_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_EOM: { int i; if (!synic->active) break; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; } case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); break; default: ret = 1; break; } return ret; } static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); return hv_vcpu->cpuid_cache.syndbg_cap_eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; } static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) hv->hv_syndbg.control.status = vcpu->run->hyperv.u.syndbg.status; return 1; } static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; hv_vcpu->exit.u.syndbg.msr = msr; hv_vcpu->exit.u.syndbg.control = syndbg->control.control; hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; vcpu->arch.complete_userspace_io = kvm_hv_syndbg_complete_userspace; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, to_hv_vcpu(vcpu)->vp_index, msr, data); switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: syndbg->control.control = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_STATUS: syndbg->control.status = data; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: syndbg->control.send_page = data; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: syndbg->control.recv_page = data; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: syndbg->control.pending_page = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_OPTIONS: syndbg->options = data; break; default: break; } return 0; } static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: *pdata = syndbg->control.control; break; case HV_X64_MSR_SYNDBG_STATUS: *pdata = syndbg->control.status; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: *pdata = syndbg->control.send_page; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: *pdata = syndbg->control.recv_page; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: *pdata = syndbg->control.pending_page; break; case HV_X64_MSR_SYNDBG_OPTIONS: *pdata = syndbg->options; break; default: break; } trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); return 0; } static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { int ret; if (!synic->active && !host) return 1; ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: *pdata = synic->control; break; case HV_X64_MSR_SVERSION: *pdata = synic->version; break; case HV_X64_MSR_SIEFP: *pdata = synic->evt_page; break; case HV_X64_MSR_SIMP: *pdata = synic->msg_page; break; case HV_X64_MSR_EOM: *pdata = 0; break; case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); break; default: ret = 1; break; } return ret; } static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; vector = synic_get_sint_vector(synic_read_sint(synic, sint)); if (vector < 0) return -ENOENT; memset(&irq, 0, sizeof(irq)); irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } int kvm_hv_synic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_vcpu_hv_synic *synic; if (!level) return -1; synic = synic_get(kvm, e->hv_sint.vcpu); if (!synic) return -EINVAL; return synic_set_irq(synic, e->hv_sint.sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); for (i = 0; i < ARRAY_SIZE(synic->sint); i++) if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) kvm_hv_notify_acked_sint(vcpu, i); } static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) return -EINVAL; atomic_set(&synic->sint_to_gsi[sint], gsi); return 0; } void kvm_hv_irq_routing_update(struct kvm *kvm) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; u32 gsi; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { if (e->type == KVM_IRQ_ROUTING_HV_SINT) kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, e->hv_sint.sint, gsi); } } } static void synic_init(struct kvm_vcpu_hv_synic *synic) { int i; memset(synic, 0, sizeof(*synic)); synic->version = HV_SYNIC_VERSION_1; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); atomic_set(&synic->sint_to_gsi[i], -1); } } static u64 get_time_ref_counter(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; /* * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, * is broken, disabled or being updated. */ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); tsc = kvm_read_l1_tsc(vcpu, rdtsc()); return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); } static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) { struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); return HRTIMER_NORESTART; } /* * stimer_start() assumptions: * a) stimer->count is not equal to 0 * b) stimer->config has HV_STIMER_ENABLE flag */ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) { u64 time_now; ktime_t ktime_now; time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { if (stimer->exp_time) { if (time_now >= stimer->exp_time) { u64 remainder; div64_u64_rem(time_now - stimer->exp_time, stimer->count, &remainder); stimer->exp_time = time_now + (stimer->count - remainder); } } else stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->exp_time - time_now)), HRTIMER_MODE_ABS); return 0; } stimer->exp_time = stimer->count; if (time_now >= stimer->count) { /* * Expire timer according to Hypervisor Top-Level Functional * specification v4(15.3.1): * "If a one shot is enabled and the specified count is in * the past, it will expire immediately." */ stimer_mark_pending(stimer, false); return 0; } trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), HRTIMER_MODE_ABS); return 0; } static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || config)) return 1; if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && !(hv_vcpu->cpuid_cache.features_edx & HV_STIMER_DIRECT_MODE_AVAILABLE))) return 1; trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); if (old_config.enable && !new_config.direct_mode && new_config.sintx == 0) new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || count)) return 1; trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); stimer->count = count; if (!host) { if (stimer->count == 0) stimer->config.enable = 0; else if (stimer->config.auto_enable) stimer->config.enable = 1; } if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) { *pconfig = stimer->config.as_uint64; return 0; } static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) { *pcount = stimer->count; return 0; } static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; int r; if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) return -ENOENT; msg_page_gfn = synic->msg_page >> PAGE_SHIFT; /* * Strictly following the spec-mandated ordering would assume setting * .msg_pending before checking .message_type. However, this function * is only called in vcpu context so the entire update is atomic from * guest POV and thus the exact order here doesn't matter. */ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, msg_off + offsetof(struct hv_message, header.message_type), sizeof(hv_hdr.message_type)); if (r < 0) return r; if (hv_hdr.message_type != HVMSG_NONE) { if (no_retry) return 0; hv_hdr.message_flags.msg_pending = 1; r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_flags, msg_off + offsetof(struct hv_message, header.message_flags), sizeof(hv_hdr.message_flags)); if (r < 0) return r; return -EAGAIN; } r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, sizeof(src_msg->header) + src_msg->header.payload_size); if (r < 0) return r; r = synic_set_irq(synic, sint); if (r < 0) return r; if (r == 0) return -EFAULT; return 0; } static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; /* * To avoid piling up periodic ticks, don't retry message * delivery for them (within "lazy" lost ticks policy). */ bool no_retry = stimer->config.periodic; payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector }; if (lapic_in_kernel(vcpu)) return !kvm_apic_set_irq(vcpu, &irq, NULL); return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) { int r, direct = stimer->config.direct_mode; stimer->msg_pending = true; if (!direct) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; if (!(stimer->config.periodic)) stimer->config.enable = 0; } } void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; if (stimer->config.enable) { exp_time = stimer->exp_time; if (exp_time) { time_now = get_time_ref_counter(vcpu->kvm); if (time_now >= exp_time) stimer_expiration(stimer); } if ((stimer->config.enable) && stimer->count) { if (!stimer->msg_pending) stimer_start(stimer); } else stimer_cleanup(stimer); } } } void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); kfree(hv_vcpu); vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu) return false; if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) return -EFAULT; return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; memset(&msg->header, 0, sizeof(msg->header)); msg->header.message_type = HVMSG_TIMER_EXPIRED; msg->header.payload_size = sizeof(*payload); payload->timer_index = stimer->index; payload->expiration_time = 0; payload->delivery_time = 0; } static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (hv_vcpu) return 0; hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); if (!hv_vcpu) return -ENOMEM; vcpu->arch.hyperv = hv_vcpu; hv_vcpu->vcpu = vcpu; synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); hv_vcpu->vp_index = vcpu->vcpu_idx; for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); } return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { struct kvm_vcpu_hv_synic *synic; int r; r = kvm_hv_vcpu_init(vcpu); if (r) return r; synic = to_hv_synic(vcpu); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; return 0; } static bool kvm_hv_msr_partition_wide(u32 msr) { bool r = false; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: case HV_X64_MSR_REFERENCE_TSC: case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_RESET: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } return r; } static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } /* * The kvmclock and Hyper-V TSC page use similar formulas, and converting * between them is possible: * * kvmclock formula: * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * * Hyper-V formula: * nsec/100 = ticks * scale / 2^64 + offset * * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. * By dividing the kvmclock formula by 100 and equating what's left we get: * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 * * Now expand the kvmclock formula and divide by 100: * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * + system_time / 100 * * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: * nsec/100 = ticks * scale / 2^64 * - tsc_timestamp * scale / 2^64 * + system_time / 100 * * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: * offset = system_time / 100 - tsc_timestamp * scale / 2^64 * * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) return false; /* * check if scale would overflow, if so we use the time ref counter * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) */ max_mul = 100ull << (32 - hv_clock->tsc_shift); if (hv_clock->tsc_to_system_mul >= max_mul) return false; /* * Otherwise compute the scale and offset according to the formulas * derived above. */ tsc_ref->tsc_scale = mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), hv_clock->tsc_to_system_mul, 100); tsc_ref->tsc_offset = hv_clock->system_time; do_div(tsc_ref->tsc_offset, 100); tsc_ref->tsc_offset -= mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); return true; } /* * Don't touch TSC page values if the guest has opted for TSC emulation after * migration. KVM doesn't fully support reenlightenment notifications and TSC * access emulation and Hyper-V is known to expect the values in TSC page to * stay constant before TSC access emulation is disabled from guest side * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC * frequency and guest visible TSC value across migration (and prevent it when * TSC scaling is unsupported). */ static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) { return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && hv->hv_tsc_emulation_control; } void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); guard(mutex)(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) return; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a * change in the master clock, do not bother with caching. */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) goto out_err; if (tsc_seq && tsc_page_update_unsafe(hv)) { if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; return; } /* * While we're computing and writing the parameters, force the * guest to use the time reference count MSR. */ hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. */ tsc_seq++; if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) tsc_seq = 1; /* Write the struct entirely before the non-zero sequence. */ smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; return; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; } void kvm_hv_request_tsc_page_update(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_lock(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET && !tsc_page_update_unsafe(hv)) hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; mutex_unlock(&hv->hv_lock); } static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) { if (!hv_vcpu->enforce_cpuid) return true; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_HYPERCALL_AVAILABLE; case HV_X64_MSR_VP_RUNTIME: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_RUNTIME_AVAILABLE; case HV_X64_MSR_TIME_REF_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_TIME_REF_COUNT_AVAILABLE; case HV_X64_MSR_VP_INDEX: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_INDEX_AVAILABLE; case HV_X64_MSR_RESET: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_RESET_AVAILABLE; case HV_X64_MSR_REFERENCE_TSC: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_REFERENCE_TSC_AVAILABLE; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNIC_AVAILABLE; case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNTIMER_AVAILABLE; case HV_X64_MSR_EOI: case HV_X64_MSR_ICR: case HV_X64_MSR_TPR: case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_FREQUENCY_MSRS; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; case HV_X64_MSR_TSC_INVARIANT_CONTROL: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_DEBUG_MSRS_AVAILABLE; default: break; } return false; } #define KVM_HV_WIN2016_GUEST_ID 0x1040a00003839 #define KVM_HV_WIN2016_GUEST_ID_MASK (~GENMASK_ULL(23, 16)) /* mask out the service version */ /* * Hyper-V enabled Windows Server 2016 SMP VMs fail to boot in !XSAVES && XSAVEC * configuration. * Such configuration can result from, for example, AMD Erratum 1386 workaround. * * Print a notice so users aren't left wondering what's suddenly gone wrong. */ static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); /* Check again under the hv_lock. */ if (hv->xsaves_xsavec_checked) return; if ((hv->hv_guest_os_id & KVM_HV_WIN2016_GUEST_ID_MASK) != KVM_HV_WIN2016_GUEST_ID) return; hv->xsaves_xsavec_checked = true; /* UP configurations aren't affected */ if (atomic_read(&kvm->online_vcpus) < 2) return; if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC)) return; pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " "If it fails to boot try disabling XSAVEC in the VM config.\n"); } void kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!vcpu->arch.hyperv_enabled || hv->xsaves_xsavec_checked) return; mutex_lock(&hv->hv_lock); __kvm_hv_xsaves_xsavec_maybe_warn(vcpu); mutex_unlock(&hv->hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; /* setting guest os id to zero disables hypercall page */ if (!hv->hv_guest_os_id) hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; break; case HV_X64_MSR_HYPERCALL: { u8 instructions[9]; int i = 0; u64 addr; /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) break; if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { hv->hv_hypercall = data; break; } /* * If Xen and Hyper-V hypercalls are both enabled, disambiguate * the same way Xen itself does, by setting the bit 31 of EAX * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just * going to be clobbered on 64-bit. */ if (kvm_xen_hypercall_enabled(kvm)) { /* orl $0x80000000, %eax */ instructions[i++] = 0x0d; instructions[i++] = 0x00; instructions[i++] = 0x00; instructions[i++] = 0x00; instructions[i++] = 0x80; } /* vmcall/vmmcall */ kvm_x86_call(patch_hypercall)(vcpu, instructions + i); i += 3; /* ret */ ((unsigned char *)instructions)[i++] = 0xc3; addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) return 1; hv->hv_hypercall = data; break; } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { if (!host) hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; else hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); } else { hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, data); case HV_X64_MSR_CRASH_CTL: if (host) return kvm_hv_msr_set_crash_ctl(kvm, data); if (data & HV_CRASH_CTL_CRASH_NOTIFY) { vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", hv->hv_crash_param[0], hv->hv_crash_param[1], hv->hv_crash_param[2], hv->hv_crash_param[3], hv->hv_crash_param[4]); /* Send notification about crash to user space */ kvm_make_request(KVM_REQ_HV_CRASH, vcpu); } break; case HV_X64_MSR_RESET: if (data == 1) { vcpu_debug(vcpu, "hyper-v reset requested\n"); kvm_make_request(KVM_REQ_HV_RESET, vcpu); } break; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: hv->hv_reenlightenment_control = data; break; case HV_X64_MSR_TSC_EMULATION_CONTROL: hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: if (data && !host) return 1; hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: /* read-only, but still ignore it if host-initiated */ if (!host) return 1; break; case HV_X64_MSR_TSC_INVARIANT_CONTROL: /* Only bit 0 is supported */ if (data & ~HV_EXPOSE_INVARIANT_TSC) return 1; /* The feature can't be disabled from the guest */ if (!host && hv->hv_invtsc_control && !data) return 1; hv->hv_invtsc_control = data; break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) return 1; if (new_vp_index == hv_vcpu->vp_index) return 0; /* * The VP index is initialized to vcpu_index by * kvm_hv_vcpu_postcreate so they initially match. Now the * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; break; } case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; /* * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ if (__put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; break; } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); case HV_X64_MSR_ICR: return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); case HV_X64_MSR_VP_RUNTIME: if (!host) return 1; hv_vcpu->runtime_offset = data - current_task_runtime_100ns(); break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return synic_set_msr(to_hv_synic(vcpu), msr, data, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; return stimer_set_config(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; return stimer_set_count(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: /* read-only, but still ignore it if host-initiated */ if (!host) return 1; break; default: kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: data = hv->hv_guest_os_id; break; case HV_X64_MSR_HYPERCALL: data = hv->hv_hypercall; break; case HV_X64_MSR_TIME_REF_COUNT: data = get_time_ref_counter(kvm); break; case HV_X64_MSR_REFERENCE_TSC: data = hv->hv_tsc_page; break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_get_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(kvm, pdata); case HV_X64_MSR_RESET: data = 0; break; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: data = hv->hv_reenlightenment_control; break; case HV_X64_MSR_TSC_EMULATION_CONTROL: data = hv->hv_tsc_emulation_control; break; case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; case HV_X64_MSR_TSC_INVARIANT_CONTROL: data = hv->hv_invtsc_control; break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; return 0; } static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: data = hv_vcpu->vp_index; break; case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_VP_ASSIST_PAGE: data = hv_vcpu->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: data = current_task_runtime_100ns() + hv_vcpu->runtime_offset; break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; return stimer_get_config(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; return stimer_get_count(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_TSC_FREQUENCY: data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: data = div64_u64(1000000000ULL, vcpu->kvm->arch.apic_bus_cycle_ns); break; default: kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; return 0; } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!host && !vcpu->arch.hyperv_enabled) return 1; if (kvm_hv_vcpu_init(vcpu)) return 1; if (kvm_hv_msr_partition_wide(msr)) { int r; mutex_lock(&hv->hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!host && !vcpu->arch.hyperv_enabled) return 1; if (kvm_hv_vcpu_init(vcpu)) return 1; if (kvm_hv_msr_partition_wide(msr)) { int r; mutex_lock(&hv->hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata, host); } static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, unsigned long *vcpu_mask) { struct kvm_hv *hv = to_kvm_hv(kvm); bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; int bank, sbank = 0; unsigned long i; u64 *bitmap; BUILD_BUG_ON(sizeof(vp_bitmap) > sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); /* * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else * fill a temporary buffer and manually test each vCPU's VP index. */ if (likely(!has_mismatch)) bitmap = (u64 *)vcpu_mask; else bitmap = vp_bitmap; /* * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask * having a '1' for each bank that exists in sparse_banks. Sets must * be in ascending order, i.e. bank0..bankN. */ memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) bitmap[bank] = sparse_banks[sbank++]; if (likely(!has_mismatch)) return; bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) __set_bit(i, vcpu_mask); } } static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) { int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; unsigned long sbank; if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) return false; /* * The index into the sparse bank is the number of preceding bits in * the valid mask. Optimize for VMs with <64 vCPUs by skipping the * fancy math if there can't possibly be preceding bits. */ if (valid_bit_nr) sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); else sbank = 0; return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, (unsigned long *)&sparse_banks[sbank]); } struct kvm_hv_hcall { /* Hypercall input data */ u64 param; u64 ingpa; u64 outgpa; u16 code; u16 var_cnt; u16 rep_cnt; u16 rep_idx; bool fast; bool rep; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; /* * Current read offset when KVM reads hypercall input data gradually, * either offset in bytes from 'ingpa' for regular hypercalls or the * number of already consumed 'XMM halves' for 'fast' hypercalls. */ union { gpa_t data_offset; int consumed_xmm_halves; }; }; static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, u16 orig_cnt, u16 cnt_cap, u64 *data) { /* * Preserve the original count when ignoring entries via a "cap", KVM * still needs to validate the guest input (though the non-XMM path * punts on the checks). */ u16 cnt = min(orig_cnt, cnt_cap); int i, j; if (hc->fast) { /* * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; for (i = 0; i < cnt; i++) { j = i + hc->consumed_xmm_halves; if (j % 2) data[i] = sse128_hi(hc->xmm[j / 2]); else data[i] = sse128_lo(hc->xmm[j / 2]); } return 0; } return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data, cnt * sizeof(*data)); } static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 *sparse_banks) { if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) return -EINVAL; /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, sparse_banks); } static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[]) { return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries); } static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, u64 *entries, int count) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; if (!hv_vcpu) return; spin_lock(&tlb_flush_fifo->write_lock); /* * All entries should fit on the fifo leaving one free for 'flush all' * entry in case another request comes in. In case there's not enough * space, just put 'flush all' entry there. */ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); goto out_unlock; } /* * Note: full fifo always contains 'flush all' entry, no need to check the * return value. */ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); out_unlock: spin_unlock(&tlb_flush_fifo->write_lock); } int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; int i, j, count; gva_t gva; if (!tdp_enabled || !hv_vcpu) return -EINVAL; tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); for (i = 0; i < count; i++) { if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) goto out_flush_all; if (is_noncanonical_invlpg_address(entries[i], vcpu)) continue; /* * Lower 12 bits of 'address' encode the number of additional * pages to flush. */ gva = entries[i] & PAGE_MASK; for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); ++vcpu->stat.tlb_flush; } return 0; out_flush_all: kfifo_reset_out(&tlb_flush_fifo->entries); /* Fall back to full flush. */ return -ENOSPC; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); unsigned long *vcpu_mask = hv_vcpu->vcpu_mask; u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; /* * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' * entries on the TLB flush fifo. The last entry, however, needs to be * always left free for 'flush all' entry which gets placed when * there is not enough space to put all the requested entries. */ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; u64 *tlb_flush_entries; u64 valid_bank_mask; struct kvm_vcpu *v; unsigned long i; bool all_cpus; /* * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS * sparse banks. Fail the build if KVM's max allowed number of * vCPUs (>4096) exceeds this limit. */ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); /* * 'Slow' hypercall's first parameter is the address in guest's memory * where hypercall parameters are placed. This is either a GPA or a * nested GPA when KVM is handling the call from L2 ('direct' TLB * flush). Translate the address here so the memory can be uniformly * read with kvm_read_guest(). */ if (!hc->fast && is_guest_mode(vcpu)) { hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { if (hc->fast) { flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); hc->consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; hc->data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags, is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; /* * Work around possible WS2012 bug: it sends hypercalls * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, * while also expecting us to flush something and crashing if * we don't. Let's treat processor_mask == 0 same as * HV_FLUSH_ALL_PROCESSORS. */ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || flush.processor_mask == 0; } else { if (hc->fast) { flush_ex.address_space = hc->ingpa; flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); hc->consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; hc->data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (!all_cpus) { if (!hc->var_cnt) goto ret_success; if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } /* * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' * case (HV_GENERIC_SET_ALL). Always adjust data_offset and * consumed_xmm_halves to make sure TLB flush entries are read * from the correct offset. */ if (hc->fast) hc->consumed_xmm_halves += hc->var_cnt; else hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]); } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { tlb_flush_entries = NULL; } else { if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries)) return HV_STATUS_INVALID_HYPERCALL_INPUT; tlb_flush_entries = __tlb_flush_entries; } /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus && !is_guest_mode(vcpu)) { kvm_for_each_vcpu(i, v, kvm) { tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else if (!is_guest_mode(vcpu)) { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { v = kvm_get_vcpu(kvm, i); if (!v) continue; tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } else { struct kvm_vcpu_hv *hv_v; bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, v, kvm) { hv_v = to_hv_vcpu(v); /* * The following check races with nested vCPUs entering/exiting * and/or migrating between L1's vCPUs, however the only case when * KVM *must* flush the TLB is when the target L2 vCPU keeps * running on the same L1 vCPU from the moment of the request until * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other * cases, e.g. when the target L2 vCPU migrates to a different L1 * vCPU or when the corresponding L1 vCPU temporary switches to a * different L2 vCPU while the request is being processed. */ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) continue; if (!all_cpus && !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, sparse_banks)) continue; __set_bit(i, vcpu_mask); tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ return (u64)HV_STATUS_SUCCESS | ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = vector }; struct kvm_vcpu *vcpu; unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (sparse_banks && !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ kvm_apic_set_irq(vcpu, &irq, NULL); } } static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; u64 valid_bank_mask; u32 vector; bool all_cpus; if (!lapic_in_kernel(vcpu)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; vector = send_ipi.vector; } else { /* 'reserved' part of hv_send_ipi should be 0 */ if (unlikely(hc->ingpa >> 32 != 0)) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = hc->outgpa; vector = (u32)hc->ingpa; } all_cpus = false; valid_bank_mask = BIT_ULL(0); trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } else { send_ipi_ex.vector = (u32)hc->ingpa; send_ipi_ex.vp_set.format = hc->outgpa; send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]); } trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, send_ipi_ex.vp_set.valid_bank_mask); vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) goto check_and_send_ipi; if (!hc->var_cnt) goto ret_success; if (!hc->fast) hc->data_offset = offsetof(struct hv_send_ipi_ex, vp_set.bank_contents); else hc->consumed_xmm_halves = 1; if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); else kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; } void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_cpuid_entry2 *entry; vcpu->arch.hyperv_enabled = hyperv_enabled; if (!hv_vcpu) { /* * KVM should have already allocated kvm_vcpu_hv if Hyper-V is * enabled in CPUID. */ WARN_ON_ONCE(vcpu->arch.hyperv_enabled); return; } memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache)); if (!vcpu->arch.hyperv_enabled) return; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; hv_vcpu->cpuid_cache.features_edx = entry->edx; } entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; } entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES); if (entry) { hv_vcpu->cpuid_cache.nested_eax = entry->eax; hv_vcpu->cpuid_cache.nested_ebx = entry->ebx; } } int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) { struct kvm_vcpu_hv *hv_vcpu; int ret = 0; if (!to_hv_vcpu(vcpu)) { if (enforce) { ret = kvm_hv_vcpu_init(vcpu); if (ret) return ret; } else { return 0; } } hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->enforce_cpuid = enforce; return ret; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; longmode = is_64_bit_hypercall(vcpu); if (longmode) kvm_rax_write(vcpu, result); else { kvm_rdx_write(vcpu, result >> 32); kvm_rax_write(vcpu, result & 0xffffffff); } } static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { u32 tlb_lock_count = 0; int ret; if (hv_result_success(result) && is_guest_mode(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu) && kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, &tlb_lock_count, sizeof(tlb_lock_count))) result = HV_STATUS_INVALID_HYPERCALL_INPUT; trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; ret = kvm_skip_emulated_instruction(vcpu); if (tlb_lock_count) kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) { return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; if (unlikely(!hc->fast)) { int ret; gpa_t gpa = hc->ingpa; if ((gpa & (__alignof__(hc->ingpa) - 1)) || offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE) return HV_STATUS_INVALID_ALIGNMENT; ret = kvm_vcpu_read_guest(vcpu, gpa, &hc->ingpa, sizeof(hc->ingpa)); if (ret < 0) return HV_STATUS_INVALID_ALIGNMENT; } /* * Per spec, bits 32-47 contain the extra "flag number". However, we * have no use for it, and in all known usecases it is zero, so just * report lookup failure if it isn't. */ if (hc->ingpa & 0xffff00000000ULL) return HV_STATUS_INVALID_PORT_ID; /* remaining bits are reserved-zero */ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); eventfd = idr_find(&hv->conn_to_evt, hc->ingpa); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) { switch (hc->code) { case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: case HVCALL_SEND_IPI_EX: return true; } return false; } static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) { int reg; kvm_fpu_get(); for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) _kvm_read_sse_reg(reg, &hc->xmm[reg]); kvm_fpu_put(); } static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) { if (!hv_vcpu->enforce_cpuid) return true; switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: return hv_vcpu->cpuid_cache.enlightenments_ebx && hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; case HVCALL_POST_MESSAGE: return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; case HVCALL_SIGNAL_EVENT: return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: case HVCALL_RESET_DEBUG_SESSION: /* * Return 'true' when SynDBG is disabled so the resulting code * will be HV_STATUS_INVALID_HYPERCALL_CODE. */ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (!(hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; case HVCALL_SEND_IPI_EX: if (!(hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; fallthrough; case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: return hv_vcpu->cpuid_cache.features_ebx & HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } return true; } int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #ifdef CONFIG_X86_64 if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); } else #endif { hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | (kvm_rax_read(vcpu) & 0xffffffff); hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | (kvm_rcx_read(vcpu) & 0xffffffff); hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | (kvm_rsi_read(vcpu) & 0xffffffff); } hc.code = hc.param & 0xffff; hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; goto hypercall_complete; } if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } kvm_hv_hypercall_read_xmm(&hc); } switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_SEND_IPI: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_SEND_IPI_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_send_ipi(vcpu, &hc); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } fallthrough; case HVCALL_RESET_DEBUG_SESSION: { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu)) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { ret = HV_STATUS_OPERATION_DENIED; break; } goto hypercall_userspace_exit; } case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); hypercall_userspace_exit: vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; vcpu->run->hyperv.u.hcall.input = hc.param; vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; } void kvm_hv_init_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_init(&hv->hv_lock); idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; eventfd = eventfd_ctx_fdget(fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) return 0; if (ret == -ENOSPC) ret = -EEXIST; eventfd_ctx_put(eventfd); return ret; } static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); eventfd = idr_remove(&hv->conn_to_evt, conn_id); mutex_unlock(&hv->hv_lock); if (!eventfd) return -ENOENT; synchronize_srcu(&kvm->srcu); eventfd_ctx_put(eventfd); return 0; } int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) { if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) || (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK)) return -EINVAL; if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN) return kvm_hv_eventfd_deassign(kvm, args->conn_id); return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_INTERFACE }, { .function = HYPERV_CPUID_VERSION }, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); if (cpuid->nent < nent) return -E2BIG; if (cpuid->nent > nent) cpuid->nent = nent; for (i = 0; i < nent; i++) { struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; u32 signature[3]; switch (ent->function) { case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_INTERFACE: ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: /* * We implement some Hyper-V 2016 functions so let's use * this version. */ ent->eax = 0x00003839; ent->ebx = 0x000A0000; break; case HYPERV_CPUID_FEATURES: ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; ent->eax |= HV_MSR_SYNIC_AVAILABLE; ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; ent->eax |= HV_MSR_HYPERCALL_AVAILABLE; ent->eax |= HV_MSR_VP_INDEX_AVAILABLE; ent->eax |= HV_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel * LAPIC */ if (!vcpu || lapic_in_kernel(vcpu)) ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; case HYPERV_CPUID_ENLIGHTMENT_INFO: ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; if (!vcpu || lapic_in_kernel(vcpu)) ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. */ ent->ebx = 0x00000FFF; break; case HYPERV_CPUID_IMPLEMENT_LIMITS: /* Maximum number of virtual processors */ ent->eax = KVM_MAX_VCPUS; /* * Maximum number of logical processors, matches * HyperV 2016. */ ent->ebx = 64; break; case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = 0; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_SYNDBG_INTERFACE: memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); ent->eax = signature[0]; break; case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; break; default: break; } } if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; return 0; }
20 9 13 71 1 1 2 67 66 5 1 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over X/Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.5.2 * * Author: Michal Ostrowski <mostrows@speakeasy.net> * * 051000 : Initialization cleanup * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/init.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/kmod.h> #include <net/sock.h> #include <linux/uaccess.h> static const struct pppox_proto *pppox_protos[PX_MAX_PROTO + 1]; int register_pppox_proto(int proto_num, const struct pppox_proto *pp) { if (proto_num < 0 || proto_num > PX_MAX_PROTO) return -EINVAL; if (pppox_protos[proto_num]) return -EALREADY; pppox_protos[proto_num] = pp; return 0; } void unregister_pppox_proto(int proto_num) { if (proto_num >= 0 && proto_num <= PX_MAX_PROTO) pppox_protos[proto_num] = NULL; } void pppox_unbind_sock(struct sock *sk) { /* Clear connection to ppp device, if attached. */ if (sk->sk_state & (PPPOX_BOUND | PPPOX_CONNECTED)) { ppp_unregister_channel(&pppox_sk(sk)->chan); sk->sk_state = PPPOX_DEAD; } } EXPORT_SYMBOL(register_pppox_proto); EXPORT_SYMBOL(unregister_pppox_proto); EXPORT_SYMBOL(pppox_unbind_sock); int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int rc; lock_sock(sk); switch (cmd) { case PPPIOCGCHAN: { int index; rc = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; rc = -EINVAL; index = ppp_channel_index(&po->chan); if (put_user(index , (int __user *) arg)) break; rc = 0; sk->sk_state |= PPPOX_BOUND; break; } default: rc = pppox_protos[sk->sk_protocol]->ioctl ? pppox_protos[sk->sk_protocol]->ioctl(sock, cmd, arg) : -ENOTTY; } release_sock(sk); return rc; } EXPORT_SYMBOL(pppox_ioctl); #ifdef CONFIG_COMPAT int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { if (cmd == PPPOEIOCSFWD32) cmd = PPPOEIOCSFWD; return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(pppox_compat_ioctl); #endif static int pppox_create(struct net *net, struct socket *sock, int protocol, int kern) { int rc = -EPROTOTYPE; if (protocol < 0 || protocol > PX_MAX_PROTO) goto out; rc = -EPROTONOSUPPORT; if (!pppox_protos[protocol]) request_module("net-pf-%d-proto-%d", PF_PPPOX, protocol); if (!pppox_protos[protocol] || !try_module_get(pppox_protos[protocol]->owner)) goto out; rc = pppox_protos[protocol]->create(net, sock, kern); module_put(pppox_protos[protocol]->owner); out: return rc; } static const struct net_proto_family pppox_proto_family = { .family = PF_PPPOX, .create = pppox_create, .owner = THIS_MODULE, }; static int __init pppox_init(void) { return sock_register(&pppox_proto_family); } static void __exit pppox_exit(void) { sock_unregister(PF_PPPOX); } module_init(pppox_init); module_exit(pppox_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver (generic socket layer)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PPPOX);
25 25 25 25 25 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" void irq_pm_handle_wakeup(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); pm_system_irq_wakeup(irq_desc_get_irq(desc)); } /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. */ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions++; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth++; WARN_ON_ONCE(desc->force_resume_depth && desc->force_resume_depth != desc->nr_actions); if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* * Called from __free_irq() with desc->lock held after @action has * been removed from the action chain. */ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions--; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth--; if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc) { unsigned long chipflags = irq_desc_get_chip(desc)->flags; struct irq_data *irqd = &desc->irq_data; if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(irqd)) { irqd_set(irqd, IRQD_WAKEUP_ARMED); if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && irqd_irq_disabled(irqd)) { /* * Interrupt marked for wakeup is in disabled state. * Enable interrupt here to unmask/enable in irqchip * to be able to resume with such interrupts. */ __enable_irq(desc); irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the * IRQD_WAKEUP_ARMED is visible before we return from * suspend_device_irqs(). */ return true; } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility * requires that the non wakeup interrupts are masked at the * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } /** * suspend_device_irqs - disable all currently enabled interrupt lines * * During system-wide suspend or hibernation device drivers need to be * prevented from receiving interrupts and this function is provided * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND * set and those which are marked as active wakeup sources. * * The active wakeup sources are handled by the flow handler entry * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { bool sync; if (irq_settings_is_nested_thread(desc)) continue; scoped_guard(raw_spinlock_irqsave, &desc->lock) sync = suspend_device_irq(desc); if (sync) synchronize_irq(irq); } } static void resume_irq(struct irq_desc *desc) { struct irq_data *irqd = &desc->irq_data; irqd_clear(irqd, IRQD_WAKEUP_ARMED); if (irqd_is_enabled_on_suspend(irqd)) { /* * Interrupt marked for wakeup was enabled during suspend * entry. Disable such interrupts to restore them back to * original state. */ __disable_irq(desc); irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } if (desc->istate & IRQS_SUSPENDED) goto resume; /* Force resume the interrupt? */ if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! */ desc->depth++; irq_state_set_disabled(desc); irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); } static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; guard(raw_spinlock_irqsave)(&desc->lock); resume_irq(desc); } } /** * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup * @irq: Interrupt to rearm */ void rearm_wake_irq(unsigned int irq) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) return; desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); } } /** * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ static void irq_pm_syscore_resume(void) { resume_irqs(true); } static struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; static int __init irq_pm_init_ops(void) { register_syscore_ops(&irq_pm_syscore_ops); return 0; } device_initcall(irq_pm_init_ops); /** * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag * set as well as those with %IRQF_FORCE_RESUME. */ void resume_device_irqs(void) { resume_irqs(false); }
81 285 299 252 451 441 122 25 25 25 279 280 250 237 182 22 3 146 7 66 7 69 2 3 2 46 23 43 32 41 22 9 75 7 43 132 447 36 288 287 288 123 122 123 11 123 123 123 286 54 54 1 1 286 285 282 101 286 286 286 1 1 1 290 288 283 283 277 285 8 291 291 291 291 281 82 285 285 285 282 283 281 447 406 434 16 114 18 113 136 132 57 137 137 53 99 82 3 82 13 450 91 58 421 422 113 407 447 28 4 451 450 450 446 28 6 451 137 433 47 58 451 308 279 114 410 409 406 219 113 66 59 451 451 445 29 451 451 114 410 441 117 448 28 408 408 114 113 58 58 58 286 314 199 195 199 36 36 9 9 9 9 9 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/sort.h> #include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "extent-tree.h" #include "transaction.h" #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" #include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_ref_node_cachep; struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees * we queue up extent allocations and backref maintenance for * delayed processing. This avoids deep call chains where we * add extents in the middle of btrfs_search_slot, and it allows * us to buffer up frequently modified backrefs in an rb tree instead * of hammering updates on the extent allocation tree. */ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; bool ret = false; u64 reserved; spin_lock(&global_rsv->lock); reserved = global_rsv->reserved; spin_unlock(&global_rsv->lock); /* * Since the global reserve is just kind of magic we don't really want * to rely on it to save our bacon, so if our size is more than the * delayed_refs_rsv and the global rsv then it's time to think about * bailing. */ spin_lock(&delayed_refs_rsv->lock); reserved += delayed_refs_rsv->reserved; if (delayed_refs_rsv->size >= reserved) ret = true; spin_unlock(&delayed_refs_rsv->lock); return ret; } /* * Release a ref head's reservation. * * @fs_info: the filesystem * @nr_refs: number of delayed refs to drop * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; u64 num_bytes; u64 released; num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and * add it to the delayed_refs_rsv. */ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; u64 reserved_bytes; if (btrfs_is_testing(fs_info)) return; num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, trans->delayed_ref_csum_deletions); if (num_bytes == 0) return; /* * Try to take num_bytes from the transaction's local delayed reserve. * If not possible, try to take as much as it's available. If the local * reserve doesn't have enough reserved space, the delayed refs reserve * will be refilled next time btrfs_delayed_refs_rsv_refill() is called * by someone or if a transaction commit is triggered before that, the * global block reserve will be used. We want to minimize using the * global block reserve for cases we can account for in advance, to * avoid exhausting it and reach -ENOSPC during a transaction commit. */ spin_lock(&local_rsv->lock); reserved_bytes = min(num_bytes, local_rsv->reserved); local_rsv->reserved -= reserved_bytes; local_rsv->full = (local_rsv->reserved >= local_rsv->size); spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; delayed_rsv->reserved += reserved_bytes; delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; trans->delayed_ref_csum_deletions = 0; } /* * Adjust the size of the delayed refs block reserve for 1 block group item * insertion, used after allocating a block group. */ void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; spin_lock(&delayed_rsv->lock); /* * Inserting a block group item does not require changing the free space * tree, only the extent tree or the block group tree, so this is all we * need. */ delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); } /* * Adjust the size of the delayed refs block reserve to release space for 1 * block group item insertion. */ void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); u64 released; released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Adjust the size of the delayed refs block reserve for 1 block group item * update. */ void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; spin_lock(&delayed_rsv->lock); /* * Updating a block group item does not result in new nodes/leaves and * does not require changing the free space tree, only the extent tree * or the block group tree, so this is all we need. */ delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); } /* * Adjust the size of the delayed refs block reserve to release space for 1 * block group item update. */ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); u64 released; released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); if (released > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); } /* * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. */ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; u64 refilled_bytes; u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) { num_bytes = block_rsv->size - block_rsv->reserved; num_bytes = min(num_bytes, limit); } spin_unlock(&block_rsv->lock); if (!num_bytes) return 0; ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; /* * We may have raced with someone else, so check again if we the block * reserve is still not full and release any excess space. */ spin_lock(&block_rsv->lock); if (block_rsv->reserved < block_rsv->size) { u64 needed = block_rsv->size - block_rsv->reserved; if (num_bytes >= needed) { block_rsv->reserved += needed; block_rsv->full = true; to_free = num_bytes - needed; refilled_bytes = needed; } else { block_rsv->reserved += num_bytes; to_free = 0; refilled_bytes = num_bytes; } } else { to_free = num_bytes; refilled_bytes = 0; } spin_unlock(&block_rsv->lock); if (to_free > 0) btrfs_space_info_free_bytes_may_use(space_info, to_free); if (refilled_bytes > 0) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, refilled_bytes, 1); return 0; } /* * compare two delayed data backrefs with same bytenr and type */ static int comp_data_refs(const struct btrfs_delayed_ref_node *ref1, const struct btrfs_delayed_ref_node *ref2) { if (ref1->data_ref.objectid < ref2->data_ref.objectid) return -1; if (ref1->data_ref.objectid > ref2->data_ref.objectid) return 1; if (ref1->data_ref.offset < ref2->data_ref.offset) return -1; if (ref1->data_ref.offset > ref2->data_ref.offset) return 1; return 0; } static int comp_refs(const struct btrfs_delayed_ref_node *ref1, const struct btrfs_delayed_ref_node *ref2, bool check_seq) { int ret = 0; if (ref1->type < ref2->type) return -1; if (ref1->type > ref2->type) return 1; if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_DATA_REF_KEY) { if (ref1->parent < ref2->parent) return -1; if (ref1->parent > ref2->parent) return 1; } else { if (ref1->ref_root < ref2->ref_root) return -1; if (ref1->ref_root > ref2->ref_root) return 1; if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) ret = comp_data_refs(ref1, ref2); } if (ret) return ret; if (check_seq) { if (ref1->seq < ref2->seq) return -1; if (ref1->seq > ref2->seq) return 1; } return 0; } static int cmp_refs_node(const struct rb_node *new, const struct rb_node *exist) { const struct btrfs_delayed_ref_node *new_node = rb_entry(new, struct btrfs_delayed_ref_node, ref_node); const struct btrfs_delayed_ref_node *exist_node = rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); return comp_refs(new_node, exist_node, true); } static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { struct rb_node *node = &ins->ref_node; struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node); return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node); } static struct btrfs_delayed_ref_head *find_first_ref_head( struct btrfs_delayed_ref_root *dr) { unsigned long from = 0; lockdep_assert_held(&dr->lock); return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT); } static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { lockdep_assert_held(&delayed_refs->lock); if (mutex_trylock(&head->mutex)) return true; refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); if (!head->tracked) { mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return false; } btrfs_put_delayed_ref_head(head); return true; } static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { lockdep_assert_held(&head->lock); rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); btrfs_put_delayed_ref(ref); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } static bool merge_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) { struct btrfs_delayed_ref_node *next; struct rb_node *node = rb_next(&ref->ref_node); bool done = false; while (!done && node) { int mod; next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); node = rb_next(node); if (seq && next->seq >= seq) break; if (comp_refs(ref, next, false)) break; if (ref->action == next->action) { mod = next->ref_mod; } else { if (ref->ref_mod < next->ref_mod) { swap(ref, next); done = true; } mod = -next->ref_mod; } drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* * Can't have multiples of the same ref on a tree block. */ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } } return done; } void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; struct rb_node *node; u64 seq = 0; lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return; /* We don't have too many refs to merge for data. */ if (head->is_data) return; seq = btrfs_tree_mod_log_lowest_seq(fs_info); again: for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; if (merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { int ret = 0; u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); if (min_seq != 0 && seq >= min_seq) { btrfs_debug(fs_info, "holding back delayed_ref %llu, lowest is %llu", seq, min_seq); ret = 1; } return ret; } struct btrfs_delayed_ref_head *btrfs_select_ref_head( const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs) { struct btrfs_delayed_ref_head *head; unsigned long start_index; unsigned long found_index; bool found_head = false; bool locked; spin_lock(&delayed_refs->lock); again: start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits); xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) { if (!head->processing) { found_head = true; break; } } if (!found_head) { if (delayed_refs->run_delayed_start == 0) { spin_unlock(&delayed_refs->lock); return NULL; } delayed_refs->run_delayed_start = 0; goto again; } head->processing = true; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + head->num_bytes; locked = btrfs_delayed_ref_lock(delayed_refs, head); spin_unlock(&delayed_refs->lock); /* * We may have dropped the spin lock to get the head mutex lock, and * that might have given someone else time to free the head. If that's * true, it has been removed from our list and we can move on. */ if (!locked) return ERR_PTR(-EAGAIN); return head; } void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { spin_lock(&delayed_refs->lock); head->processing = false; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); btrfs_delayed_ref_unlock(head); } void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits); lockdep_assert_held(&delayed_refs->lock); lockdep_assert_held(&head->lock); xa_erase(&delayed_refs->head_refs, index); head->tracked = false; delayed_refs->num_heads--; if (!head->processing) delayed_refs->num_heads_ready--; } struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; lockdep_assert_held(&head->mutex); lockdep_assert_held(&head->lock); if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) return NULL; /* * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. * This is to prevent a ref count from going down to zero, which deletes * the extent item from the extent tree, when there still are references * to add, which would fail because they would not find the extent item. */ if (!list_empty(&head->ref_add_list)) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); ref = rb_entry(rb_first_cached(&head->ref_tree), struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; } /* * Helper to insert the ref_node to the tail or merge with tail. * * Return false if the ref was inserted. * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). */ static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); if (!exist) { if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); spin_unlock(&href->lock); trans->delayed_ref_updates++; return false; } /* Now we are sure we can merge */ if (exist->action == ref->action) { mod = ref->ref_mod; } else { /* Need to change action */ if (exist->ref_mod < ref->ref_mod) { exist->action = ref->action; mod = -exist->ref_mod; exist->ref_mod = ref->ref_mod; if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&exist->add_list, &href->ref_add_list); else if (ref->action == BTRFS_DROP_DELAYED_REF) { ASSERT(!list_empty(&exist->add_list)); list_del_init(&exist->add_list); } else { ASSERT(0); } } else mod = -ref->ref_mod; } exist->ref_mod += mod; /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } /* * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); /* * When freeing an extent, we may not know the owning root when we * first create the head_ref. However, some deref before the last deref * will know it, so we just need to update the head_ref accordingly. */ if (!existing->owning_root) existing->owning_root = update->owning_root; if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up * with an existing head ref without * the must_insert_reserved flag set. * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting * is done correctly */ existing->num_bytes = update->num_bytes; } if (update->extent_op) { if (!existing->extent_op) { existing->extent_op = update->extent_op; } else { if (update->extent_op->update_key) { memcpy(&existing->extent_op->key, &update->extent_op->key, sizeof(update->extent_op->key)); existing->extent_op->update_key = true; } if (update->extent_op->update_flags) { existing->extent_op->flags_to_set |= update->extent_op->flags_to_set; existing->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(update->extent_op); } } /* * update the reference mod on the head to reflect this new operation, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. * We reserve bytes for csum deletion when adding or updating a ref head * see add_delayed_ref_head() for more details. */ if (existing->is_data) { u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, existing->num_bytes); if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; trans->delayed_ref_csum_deletions += csum_leaves; } } spin_unlock(&existing->lock); } static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_ref *generic_ref, struct btrfs_qgroup_extent_record *qrecord, u64 reserved) { int count_mod = 1; bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); switch (generic_ref->action) { case BTRFS_ADD_DELAYED_REF: /* count_mod is already set to 1. */ break; case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; break; case BTRFS_DROP_DELAYED_REF: /* * The head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ count_mod = -1; break; case BTRFS_ADD_DELAYED_EXTENT: /* * BTRFS_ADD_DELAYED_EXTENT means that we need to update the * reserved accounting when the extent is finally added, or if a * later modification deletes the delayed ref without ever * inserting the extent into the extent allocation tree. * ref->must_insert_reserved is the flag used to record that * accounting mods are required. * * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not * required. */ must_insert_reserved = true; break; } refcount_set(&head_ref->refs, 1); head_ref->bytenr = generic_ref->bytenr; head_ref->num_bytes = generic_ref->num_bytes; head_ref->ref_mod = count_mod; head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; head_ref->owning_root = generic_ref->owning_root; head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); head_ref->tracked = false; head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); /* If not metadata set an impossible level to help debugging. */ if (generic_ref->type == BTRFS_REF_METADATA) head_ref->level = generic_ref->tree_ref.level; else head_ref->level = U8_MAX; if (qrecord) { if (generic_ref->ref_root && reserved) { qrecord->data_rsv = reserved; qrecord->data_rsv_refroot = generic_ref->ref_root; } qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } } /* * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct * overall modification count. * * Returns an error pointer in case of an error. */ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, int action, bool *qrecord_inserted_ret) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); #if BITS_PER_LONG == 32 if (head_ref->bytenr >= MAX_LFS_FILESIZE) { if (qrecord) xa_release(&delayed_refs->dirty_extents, index); btrfs_err_rl(fs_info, "delayed ref head %llu is beyond 32bit page cache and xarray index limit", head_ref->bytenr); btrfs_err_32bit_limit(fs_info); return ERR_PTR(-EOVERFLOW); } #endif /* Record qgroup extent info if provided */ if (qrecord) { int ret; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, head_ref->bytenr); if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); kfree(qrecord); } else { qrecord_inserted = true; } } trace_add_delayed_ref_head(fs_info, head_ref, action); existing = xa_load(&delayed_refs->head_refs, index); if (existing) { update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref */ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC); if (xa_is_err(existing)) { /* Memory was preallocated by the caller. */ ASSERT(xa_err(existing) != -ENOMEM); return ERR_PTR(xa_err(existing)); } else if (WARN_ON(existing)) { /* * Shouldn't happen we just did a lookup before under * delayed_refs->lock. */ return ERR_PTR(-EEXIST); } head_ref->tracked = true; /* * We reserve the amount of bytes needed to delete csums when * adding the ref head and not when adding individual drop refs * since the csum items are deleted only after running the last * delayed drop ref (the data extent's ref count drops to 0). */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_csum_deletions += btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; return head_ref; } /* * Initialize the structure which represents a modification to an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * * @ref: The structure which is going to be initialized. * * @bytenr: The logical address of the extent for which a modification is * going to be recorded. * * @num_bytes: Size of the extent whose modification is being recorded. * * @ref_root: The id of the root where this modification has originated, this * can be either one of the well-known metadata trees or the * subvolume id which references this extent. * * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or * BTRFS_ADD_DELAYED_EXTENT * * @ref_type: Holds the type of the extent which is being recorded, can be * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ * BTRFS_EXTENT_DATA_REF_KEY when recording data extent */ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, struct btrfs_ref *generic_ref) { int action = generic_ref->action; u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; if (btrfs_is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); ref->bytenr = generic_ref->bytenr; ref->num_bytes = generic_ref->num_bytes; ref->ref_mod = 1; ref->action = action; ref->seq = seq; ref->type = btrfs_ref_type(generic_ref); ref->ref_root = generic_ref->ref_root; ref->parent = generic_ref->parent; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); if (generic_ref->type == BTRFS_REF_DATA) ref->data_ref = generic_ref->data_ref; else ref->tree_ref = generic_ref->tree_ref; } void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif generic_ref->tree_ref.level = level; generic_ref->type = BTRFS_REF_METADATA; if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; } void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_DEBUG /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif generic_ref->data_ref.objectid = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) && (!mod_root || btrfs_is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; } static int add_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op, u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_node *node; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_head *new_head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits); bool qrecord_reserved = false; bool qrecord_inserted; int action = generic_ref->action; bool merged; int ret; node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); if (!node) return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { ret = -ENOMEM; goto free_node; } delayed_refs = &trans->transaction->delayed_refs; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { ret = -ENOMEM; goto free_head_ref; } if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) { ret = -ENOMEM; goto free_record; } qrecord_reserved = true; } ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); if (ret) { if (qrecord_reserved) xa_release(&delayed_refs->dirty_extents, index); goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; spin_lock(&delayed_refs->lock); /* * insert both the head node and the new ref without dropping * the spin lock */ new_head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); if (IS_ERR(new_head_ref)) { xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); goto free_record; } head_ref = new_head_ref; merged = insert_delayed_ref(trans, head_ref, node); spin_unlock(&delayed_refs->lock); /* * Need to update the delayed_refs_rsv with any changes we may have * made. */ btrfs_update_delayed_refs_rsv(trans); if (generic_ref->type == BTRFS_REF_DATA) trace_add_delayed_data_ref(trans->fs_info, node); else trace_add_delayed_tree_ref(trans->fs_info, node); if (merged) kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); return 0; free_record: kfree(record); free_head_ref: kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); free_node: kmem_cache_free(btrfs_delayed_ref_node_cachep, node); return ret; } /* * Add a delayed tree ref. This does all of the accounting required to make sure * the delayed ref is eventually processed before this transaction commits. */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, struct btrfs_delayed_extent_op *extent_op) { ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); return add_delayed_ref(trans, generic_ref, extent_op, 0); } /* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved) { ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action); return add_delayed_ref(trans, generic_ref, NULL, reserved); } int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u8 level, struct btrfs_delayed_extent_op *extent_op) { const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits); struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_head *head_ref_ret; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_ref generic_ref = { .type = BTRFS_REF_METADATA, .action = BTRFS_UPDATE_DELAYED_HEAD, .bytenr = bytenr, .num_bytes = num_bytes, .tree_ref.level = level, }; int ret; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; init_delayed_ref_head(head_ref, &generic_ref, NULL, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS); if (ret) { kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return ret; } spin_lock(&delayed_refs->lock); head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, NULL); if (IS_ERR(head_ref_ret)) { xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return PTR_ERR(head_ref_ret); } spin_unlock(&delayed_refs->lock); /* * Need to update the delayed_refs_rsv with any changes we may have * made. */ btrfs_update_delayed_refs_rsv(trans); return 0; } void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { if (refcount_dec_and_test(&ref->refs)) { WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); kmem_cache_free(btrfs_delayed_ref_node_cachep, ref); } } /* * This does a simple search for the head node for a given extent. Returns the * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { const unsigned long index = (bytenr >> fs_info->sectorsize_bits); lockdep_assert_held(&delayed_refs->lock); return xa_load(&delayed_refs->head_refs, index); } static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) { int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; if (type < entry->type) return -1; if (type > entry->type) return 1; if (type == BTRFS_TREE_BLOCK_REF_KEY) { if (root < entry->ref_root) return -1; if (root > entry->ref_root) return 1; } else { if (parent < entry->parent) return -1; if (parent > entry->parent) return 1; } return 0; } /* * Check to see if a given root/parent reference is attached to the head. This * only checks for BTRFS_ADD_DELAYED_REF references that match, as that * indicates the reference exists for the given root or parent. This is for * tree blocks only. * * @head: the head of the bytenr we're searching. * @root: the root objectid of the reference if it is a normal reference. * @parent: the parent if this is a shared backref. */ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, u64 root, u64 parent) { struct rb_node *node; bool found = false; lockdep_assert_held(&head->mutex); spin_lock(&head->lock); node = head->ref_tree.rb_root.rb_node; while (node) { struct btrfs_delayed_ref_node *entry; int ret; entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); ret = find_comp(entry, root, parent); if (ret < 0) { node = node->rb_left; } else if (ret > 0) { node = node->rb_right; } else { /* * We only want to count ADD actions, as drops mean the * ref doesn't exist. */ if (entry->action == BTRFS_ADD_DELAYED_REF) found = true; break; } } spin_unlock(&head->lock); return found; } void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans) { struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; spin_lock(&delayed_refs->lock); while (true) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; head = find_first_ref_head(delayed_refs); if (!head) break; if (!btrfs_delayed_ref_lock(delayed_refs, head)) continue; spin_lock(&head->lock); while ((n = rb_first_cached(&head->ref_tree)) != NULL) { struct btrfs_delayed_ref_node *ref; ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); drop_delayed_ref(fs_info, delayed_refs, head, ref); } if (head->must_insert_reserved) pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (!btrfs_is_testing(fs_info) && pin_bytes) { struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, head->bytenr); if (WARN_ON_ONCE(bg == NULL)) { /* * Unexpected and there's nothing we can do here * because we are in a transaction abort path, * so any errors can only be ignored or reported * while attempting to cleanup all resources. */ btrfs_err(fs_info, "block group for delayed ref at %llu was not found while destroying ref head", head->bytenr); } else { spin_lock(&bg->space_info->lock); spin_lock(&bg->lock); bg->pinned += head->num_bytes; btrfs_space_info_update_bytes_pinned(bg->space_info, head->num_bytes); bg->reserved -= head->num_bytes; bg->space_info->bytes_reserved -= head->num_bytes; spin_unlock(&bg->lock); spin_unlock(&bg->space_info->lock); btrfs_put_block_group(bg); } btrfs_error_unpin_extent_range(fs_info, head->bytenr, head->bytenr + head->num_bytes - 1); } if (!btrfs_is_testing(fs_info)) btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } if (!btrfs_is_testing(fs_info)) btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); } void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); kmem_cache_destroy(btrfs_delayed_ref_node_cachep); kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) return -ENOMEM; btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); if (!btrfs_delayed_ref_node_cachep) goto fail; btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); if (!btrfs_delayed_extent_op_cachep) goto fail; return 0; fail: btrfs_delayed_ref_exit(); return -ENOMEM; }
8 30 30 30 4 4 5 1 1 1 36 36 36 3 1 4 2 2 28 1 1 1 4 3 9 2 4 1 2 33 33 5 5 28 28 31 2 27 1 1 14 2 2 2 2 2 54 53 54 54 1 53 33 13 54 54 41 13 13 54 53 63 54 45 45 18 13 13 13 14 14 14 13 1 14 15 90 90 84 1 15 70 1 68 20 19 10 10 10 9 1 25 43 43 9 34 1 32 1 5 10 10 10 10 10 33 33 13 14 133 68 6 6 14 13 1 14 14 1 13 13 13 14 3 3 3 3 3 3 3 3 3 16 138 137 11 35 40 43 8 16 67 131 18 137 138 138 138 14 14 3 12 48 48 137 125 63 63 138 188 187 186 41 1 40 40 57 57 57 57 57 45 45 9 36 36 36 2 6 7 7 7 9 24 4 3 5 22 22 22 10 10 17 3 2 20 1 1 18 19 19 19 16 3 3 16 16 3 3 15 128 1 130 131 10 130 130 130 129 130 105 105 103 2 105 105 105 105 24 25 79 20 105 19 8 1 1 10 10 11 130 129 130 130 129 129 40 40 40 282 283 279 29 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <crypto/sha2.h> #include <crypto/utils.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/ip6_route.h> #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> #include <trace/events/sock.h> static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, enum linux_mptcp_mib_field field) { MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); } static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); pr_debug("subflow_req=%p\n", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, void *hmac) { u8 msg[8]; put_unaligned_be32(nonce1, &msg[0]); put_unaligned_be32(nonce2, &msg[4]); mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); } static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && ((mptcp_pm_is_userspace(msk) && mptcp_userspace_pm_active(msk)) || READ_ONCE(msk->pm.accept_subflow)); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); subflow_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), subflow_req->local_nonce, subflow_req->remote_nonce, hmac); subflow_req->thmac = get_unaligned_be64(hmac); } static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_sock *msk; int local_id; msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; } local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); if (local_id < 0) { sock_put((struct sock *)msk); return NULL; } subflow_req->local_id = local_id; subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req); return msk; } static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); subflow_req->mp_capable = 0; subflow_req->mp_join = 0; subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener)); subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener)); subflow_req->msk = NULL; mptcp_token_init_request(req); } static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) { struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (mpext) { memset(mpext, 0, sizeof(*mpext)); mpext->reset_reason = reason; } } static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset * should be sent. */ static int subflow_check_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info)) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EINVAL; } #endif mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYN); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYN); if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); if (unlikely(listener->pm_listener)) return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); if (mp_opt.backup) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); } else if (unlikely(listener->pm_listener)) { return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; again: do { get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key)); } while (subflow_req->local_key == 0); if (unlikely(req->syncookie)) { mptcp_crypto_key_sha(subflow_req->local_key, &subflow_req->token, &subflow_req->idsn); if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } return 0; } err = mptcp_token_new_request(req); if (err == 0) subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; subflow_req->remote_id = mp_opt.join_id; subflow_req->token = mp_opt.token; subflow_req->remote_nonce = mp_opt.nonce; subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. */ if (!subflow_req->msk) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d\n", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX); } subflow_req_create_thmac(subflow_req); if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } subflow_init_req_cookie_join_save(subflow_req, skb); } pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } return 0; } int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); mptcp_get_options(skb, &mp_opt); opt_mp_capable = !!(mp_opt.suboptions & OPTION_MPTCP_MPC_ACK); opt_mp_join = !!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK); if (opt_mp_capable && opt_mp_join) return -EINVAL; if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; subflow_req->local_key = mp_opt.rcvr_key; err = mptcp_token_new_request(req); if (err) return err; subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; subflow_req->mp_join = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; } return 0; } EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req); static enum sk_rst_reason mptcp_get_rst_reason(const struct sk_buff *skb) { const struct mptcp_ext *mpext = mptcp_get_ext(skb); if (!mpext) return SK_RST_REASON_NOT_SPECIFIED; return sk_rst_convert_mptcp_reason(mpext->reset_reason); } static struct dst_entry *subflow_v4_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp_request_sock_ops.send_reset(sk, skb, mptcp_get_rst_reason(skb)); return NULL; } static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_request_sock *ireq = inet_rsk(req); /* clear tstamp_ok, as needed depending on cookie */ if (foc && foc->len > -1) ireq->tstamp_ok = 0; if (synack_type == TCP_SYNACK_FASTOPEN) mptcp_fastopen_subflow_synack_set_params(subflow, req); } static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { subflow_prep_synack(sk, req, foc, synack_type); return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, synack_type, syn_skb); } static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { struct dst_entry *dst; int err; tcp_rsk(req)->is_mptcp = 1; subflow_init_req(req, sk); dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn); if (!dst) return NULL; err = subflow_check_req(req, sk, skb); if (err == 0) return dst; dst_release(dst); if (!req->syncookie) tcp6_request_sock_ops.send_reset(sk, skb, mptcp_get_rst_reason(skb)); return NULL; } #endif /* validate received truncated hmac and create hmac for third ACK */ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) { u8 hmac[SHA256_DIGEST_SIZE]; u64 thmac; subflow_generate_hmac(subflow->remote_key, subflow->local_key, subflow->remote_nonce, subflow->local_nonce, hmac); thmac = get_unaligned_be64(hmac); pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n", subflow, subflow->token, thmac, subflow->thmac); return thmac == subflow->thmac; } void mptcp_subflow_reset(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; /* mptcp_mp_fail_no_response() can reach here on an already closed * socket */ if (ssk->sk_state == TCP_CLOSE) return; /* must hold: tcp_done() could drop last reference on parent */ sock_hold(sk); mptcp_send_active_reset_reason(ssk); tcp_done(ssk); if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk) { return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } void __mptcp_sync_state(struct sock *sk, int state) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk = msk->first; subflow = mptcp_subflow_ctx(ssk); __mptcp_propagate_sndbuf(sk, ssk); if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, ssk); if (sk->sk_state == TCP_SYN_SENT) { /* subflow->idsn is always available is TCP_SYN_SENT state, * even for the FASTOPEN scenarios */ WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_set_state(sk, state); sk->sk_state_change(sk); } } static void subflow_set_remote_key(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { /* active MPC subflow will reach here multiple times: * at subflow_finish_connect() time and at 4th ack time */ if (subflow->remote_key_valid) return; subflow->remote_key_valid = 1; subflow->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); } static void mptcp_propagate_state(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(sk); mptcp_data_lock(sk); if (mp_opt) { /* Options are available only in the non fallback cases * avoid updating rx path fields otherwise */ WRITE_ONCE(msk->snd_una, subflow->idsn + 1); WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); subflow_set_remote_key(msk, subflow, mp_opt); } if (!sock_owned_by_user(sk)) { __mptcp_sync_state(sk, ssk->sk_state); } else { msk->pending_state = ssk->sk_state; __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); } mptcp_data_unlock(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; struct mptcp_sock *msk; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; msk = mptcp_sk(parent); subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { if (!mptcp_try_fallback(sk, MPTCP_MIB_MPCAPABLEACTIVEFALLBACK)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FALLBACKFAILED); goto do_reset; } goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); if (mp_opt.deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_active_enable(parent); mptcp_propagate_state(parent, sk, subflow, &mp_opt); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_SYNACK)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; WRITE_ONCE(subflow->remote_id, mp_opt.join_id); pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } if (!mptcp_finish_join(sk)) goto do_reset; subflow_generate_hmac(subflow->local_key, subflow->remote_key, subflow->local_nonce, subflow->remote_nonce, hmac); memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN); subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); if (subflow->backup) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d\n", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { /* It looks like MPTCP is blocked, while TCP is not */ if (subflow->mpc_drop) mptcp_active_disable(parent); fallback: mptcp_propagate_state(parent, sk, subflow, NULL); } return; do_reset: subflow->reset_transient = 0; mptcp_subflow_reset(sk); } static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id) { WARN_ON_ONCE(local_id < 0 || local_id > 255); WRITE_ONCE(subflow->local_id, local_id); } static int subflow_chk_local_id(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int err; if (likely(subflow->local_id >= 0)) return 0; err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); if (err < 0) return err; subflow_set_local_id(subflow, err); subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk); return 0; } static int subflow_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet_sk_rebuild_header(sk); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static int subflow_v6_rebuild_header(struct sock *sk) { int err = subflow_chk_local_id(sk); if (unlikely(err < 0)) return err; return inet6_sk_rebuild_header(sk); } #endif static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p\n", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: tcp_listendrop(sk); return 0; } static void subflow_v4_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp_request_sock_ops.destructor(req); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; static struct proto tcpv6_prot_override __ro_after_init; static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); pr_debug("subflow=%p\n", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void subflow_v6_req_destructor(struct request_sock *req) { subflow_req_destructor(req); tcp6_request_sock_ops.destructor(req); } #endif struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { if (ops->family == AF_INET) ops = &mptcp_subflow_v4_request_sock_ops; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ops->family == AF_INET6) ops = &mptcp_subflow_v6_request_sock_ops; #endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), subflow_req->remote_nonce, subflow_req->local_nonce, hmac); return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { struct inet_connection_sock *icsk = inet_csk(sk); mptcp_subflow_tcp_fallback(sk, old_ctx); icsk->icsk_ulp_ops = NULL; rcu_assign_pointer(icsk->icsk_ulp_data, NULL); tcp_sk(sk)->is_mptcp = 0; mptcp_subflow_ops_undo_override(sk); } void mptcp_subflow_drop_ctx(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); if (!ctx) return; list_del(&mptcp_subflow_ctx(ssk)->node); if (inet_csk(ssk)->icsk_ulp_ops) { subflow_ulp_fallback(ssk, ctx); if (ctx->conn) sock_put(ctx->conn); } kfree_rcu(ctx, rcu); } void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; enum sk_rst_reason reason; struct mptcp_sock *owner; struct sock *child; pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed */ mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join; fallback = !tcp_rsk(req)->is_mptcp; if (fallback) goto create_child; /* if the sk is MP_CAPABLE, we try to fetch the client key */ if (subflow_req->mp_capable) { /* we can receive and accept an in-window, out-of-order pkt, * which may not carry the MP_CAPABLE opt even on mptcp enabled * paths: always try to extract the peer key, and fallback * for packets missing it. * Even OoO DSS packets coming legitly after dropped or * reordered MPC will cause fallback, but we don't have other * options. */ mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_ACK))) fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK)) fallback = true; } create_child: child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); tcp_rsk(req)->drop_req = false; /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. */ if (!ctx || fallback) { if (fallback_is_fatal) { subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req); if (!ctx->conn) goto fallback; ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); if (mp_opt.deny_join_id0) WRITE_ONCE(owner->pm.remote_deny_join_id0, true); mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress * mpc option */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } } else if (ctx->mp_join) { owner = subflow_req->msk; if (!owner) { subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!mptcp_can_accept_new_subflow(owner)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX); } if (!mptcp_finish_join(child)) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(child); subflow_add_reset_reason(skb, subflow->reset_reason); goto dispose_child; } SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); tcp_rsk(req)->drop_req = true; } } /* check for expected invariant - should never trigger, just help * catching earlier subtle bugs */ WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp && (!mptcp_subflow_ctx(child) || !mptcp_subflow_ctx(child)->conn)); return child; dispose_child: mptcp_subflow_drop_ctx(child); tcp_rsk(req)->drop_req = true; inet_csk_prepare_for_destroy_sock(child); tcp_done(child); reason = mptcp_get_rst_reason(skb); req->rsk_ops->send_reset(sk, skb, reason); /* The last child reference will be released by the caller */ return child; fallback: if (fallback) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); mptcp_subflow_drop_ctx(child); return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; static struct proto tcp_prot_override __ro_after_init; enum mapping_status { MAPPING_OK, MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, MAPPING_BAD_CSUM, MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; if (unlikely(skb_consumed >= skb->len)) { DEBUG_NET_WARN_ON_ONCE(1); return true; } return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); } static bool validate_mapping(struct sock *ssk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; if (unlikely(before(ssn, subflow->map_subflow_seq))) { /* Mapping covers data later in the subflow stream, * currently unsupported. */ dbg_bad_map(subflow, ssn); return false; } if (unlikely(!before(ssn, subflow->map_subflow_seq + subflow->map_data_len))) { /* Mapping does covers past subflow data, invalid */ dbg_bad_map(subflow, ssn); return false; } return true; } static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb, bool csum_reqd) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; __sum16 csum; int len; if (!csum_reqd) return MAPPING_OK; /* mapping already validated on previous traversal */ if (subflow->map_csum_len == subflow->map_data_len) return MAPPING_OK; /* traverse the receive queue, ensuring it contains a full * DSS mapping and accumulating the related csum. * Preserve the accoumlate csum across multiple calls, to compute * the csum only once */ delta = subflow->map_data_len - subflow->map_csum_len; for (;;) { seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len; offset = seq - TCP_SKB_CB(skb)->seq; /* if the current skb has not been accounted yet, csum its contents * up to the amount covered by the current DSS */ if (offset < skb->len) { __wsum csum; len = min(skb->len - offset, delta); csum = skb_checksum(skb, offset, len, 0); subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum, subflow->map_csum_len); delta -= len; subflow->map_csum_len += len; } if (delta == 0) break; if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) { /* if this subflow is closed, the partial mapping * will be never completed; flush the pending skbs, so * that subflow_sched_work_if_closed() can kick in */ if (unlikely(ssk->sk_state == TCP_CLOSE)) while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); /* not enough data to validate the csum */ return MAPPING_EMPTY; } /* the DSS mapping for next skbs will be validated later, * when a get_mapping_status call will process such skb */ skb = skb->next; } /* note that 'map_data_len' accounts only for the carried data, does * not include the eventual seq increment due to the data fin, * while the pseudo header requires the original DSS data len, * including that */ csum = __mptcp_make_csum(subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len + subflow->map_data_fin, subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; return MAPPING_OK; } static enum mapping_status get_mapping_status(struct sock *ssk, struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; u64 map_seq; skb = skb_peek(&ssk->sk_receive_queue); if (!skb) return MAPPING_EMPTY; if (mptcp_check_fallback(ssk)) return MAPPING_DUMMY; mpext = mptcp_get_ext(skb); if (!mpext || !mpext->use_map) { if (!subflow->map_valid && !skb->len) { /* the TCP stack deliver 0 len FIN pkt to the receive * queue, that is the only 0len pkts ever expected here, * and we can admit no mapping only for 0 len pkts */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) WARN_ONCE(1, "0len seq %d:%d flags %x", TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, TCP_SKB_CB(skb)->tcp_flags); sk_eat_skb(ssk, skb); return MAPPING_EMPTY; } /* If the required DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) return MAPPING_NODSS; goto validate_seq; } trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); return MAPPING_INVALID; } if (mpext->data_fin == 1) { u64 data_fin_seq; if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping * has been fully consumed. Continue * handling the existing mapping. */ skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } if (updated) mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } data_fin_seq = mpext->data_seq + data_len - 1; /* If mpext->data_seq is a 32-bit value, data_fin_seq must also * be limited to 32 bits. */ if (!mpext->dsn64) data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ data_len--; } map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ if (subflow->map_seq == map_seq && subflow->map_subflow_seq == mpext->subflow_seq && subflow->map_data_len == data_len && subflow->map_csum_reqd == mpext->csum_reqd) { skb_ext_del(skb, SKB_EXT_MPTCP); goto validate_csum; } /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ if (skb_is_fully_mapped(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; } /* will validate the next map after consuming the current one */ goto validate_csum; } subflow->map_seq = map_seq; subflow->map_subflow_seq = mpext->subflow_seq; subflow->map_data_len = data_len; subflow->map_valid = 1; subflow->map_data_fin = mpext->data_fin; subflow->mpc_map = mpext->mpc_map; subflow->map_csum_reqd = mpext->csum_reqd; subflow->map_csum_len = 0; subflow->map_data_csum = csum_unfold(mpext->csum); /* Cfr RFC 8684 Section 3.3.0 */ if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); validate_seq: /* we revalidate valid mapping on new skb, because we must ensure * the current skb is completely covered by the available mapping */ if (!validate_mapping(ssk, skb)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH); return MAPPING_INVALID; } skb_ext_del(skb, SKB_EXT_MPTCP); validate_csum: return validate_data_csum(ssk, skb, csum_reqd); } static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, u64 limit) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; struct tcp_sock *tp = tcp_sk(ssk); u32 offset, incr, avail_len; offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; if (WARN_ON_ONCE(offset > skb->len)) goto out; avail_len = skb->len - offset; incr = limit >= avail_len ? avail_len + fin : limit; pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; out: if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq)) sk_eat_skb(ssk, skb); if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) subflow->map_valid = 0; } static bool subflow_is_done(const struct sock *sk) { return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; } /* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE && (ssk->sk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; if (!skb_queue_empty(&ssk->sk_receive_queue)) return; if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) mptcp_schedule_work(sk); /* when the fallback subflow closes the rx side, trigger a 'dummy' * ingress data fin, so that the msk state will follow along */ if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && msk->first == ssk && mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) mptcp_schedule_work(sk); } static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; /* we are really failing, prevent any later subflow join */ spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) { spin_unlock_bh(&msk->fallback_lock); return false; } msk->allow_subflows = false; spin_unlock_bh(&msk->fallback_lock); /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) return false; /* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set */ if (sock_flag((struct sock *)msk, SOCK_DEAD)) return true; /* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all; */ fail_tout = jiffies + TCP_RTO_MAX; if (!fail_tout) fail_tout = 1; WRITE_ONCE(subflow->fail_tout, fail_tout); tcp_send_ack(ssk); mptcp_reset_tout_timer(msk, subflow->fail_tout); return true; } static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); enum mapping_status status; struct mptcp_sock *msk; struct sk_buff *skb; if (!skb_peek(&ssk->sk_receive_queue)) WRITE_ONCE(subflow->data_avail, false); if (subflow->data_avail) return true; msk = mptcp_sk(subflow->conn); for (;;) { u64 ack_seq; u64 old_ack; status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback; if (status != MAPPING_OK) goto no_data; skb = skb_peek(&ssk->sk_receive_queue); if (WARN_ON_ONCE(!skb)) goto no_data; if (unlikely(!READ_ONCE(msk->can_ack))) goto fallback; old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); continue; } WRITE_ONCE(subflow->data_avail, true); break; } return true; no_data: subflow_sched_work_if_closed(msk, ssk); return false; fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ if (status == MAPPING_BAD_CSUM && (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; if (!mptcp_subflow_fail(msk, ssk)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } WRITE_ONCE(subflow->data_avail, true); return true; } if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSFALLBACK)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; subflow->reset_reason = status == MAPPING_NODSS ? MPTCP_RST_EMIDDLEBOX : MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); mptcp_send_active_reset_reason(ssk); WRITE_ONCE(subflow->data_avail, false); return false; } } skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; WRITE_ONCE(subflow->data_avail, true); return true; } bool mptcp_subflow_data_available(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* check if current mapping is still valid */ if (subflow->map_valid && mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) { subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, subflow->map_data_len); } return subflow_check_data_avail(sk); } /* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, * not the ssk one. * * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static void subflow_error_report(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; /* bail early if this is a no-op, so that we avoid introducing a * problematic lockdep dependency between TCP accept queue lock * and msk socket spinlock */ if (!sk->sk_socket) return; mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); u16 state = 1 << inet_sk_state_load(sk); struct sock *parent = subflow->conn; struct mptcp_sock *msk; trace_sk_data_ready(sk); msk = mptcp_sk(parent); if (state & TCPF_LISTEN) { /* MPJ subflow are removed from accept queue before reaching here, * avoid stray wakeups */ if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue)) return; parent->sk_data_ready(parent); return; } WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable && !subflow->mp_join && !(state & TCPF_CLOSE)); if (mptcp_subflow_data_available(sk)) { mptcp_data_ready(parent, sk); /* subflow-level lowat test are not relevant. * respect the msk-level threshold eventually mandating an immediate ack */ if (mptcp_data_avail(msk) < parent->sk_rcvlowat && (tcp_sk(sk)->rcv_nxt - tcp_sk(sk)->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } else if (unlikely(sk->sk_err)) { subflow_error_report(sk); } } static void subflow_write_space(struct sock *ssk) { struct sock *sk = mptcp_subflow_ctx(ssk)->conn; mptcp_propagate_sndbuf(sk, ssk); mptcp_write_space(sk); } static const struct inet_connection_sock_af_ops * subflow_default_af_ops(struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_family == AF_INET6) return &subflow_v6_specific; #endif return &subflow_specific; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct inet_connection_sock_af_ops *target; target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) return; subflow->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = target; } #endif void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family) { memset(addr, 0, sizeof(*addr)); addr->ss_family = family; if (addr->ss_family == AF_INET) { struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; if (info->family == AF_INET) in_addr->sin_addr = info->addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ipv6_addr_v4mapped(&info->addr6)) in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3]; #endif in_addr->sin_port = info->port; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; if (info->family == AF_INET) ipv6_addr_set_v4mapped(info->addr.s_addr, &in6_addr->sin6_addr); else in6_addr->sin6_addr = info->addr6; in6_addr->sin6_port = info->port; } #endif } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; int local_id = local->addr.id; struct sockaddr_storage addr; int remote_id = remote->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; /* The userspace PM sent the request too early? */ if (!mptcp_is_fully_established(sk)) goto err_out; err = mptcp_subflow_create_socket(sk, local->addr.family, &sf); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR); pr_debug("msk=%p local=%d remote=%d create sock error: %d\n", msk, local_id, remote_id, err); goto err_out; } ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); do { get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); /* if 'IPADDRANY', the ID will be set later, after the routing */ if (local->addr.family == AF_INET) { if (!local->addr.addr.s_addr) local_id = -1; #if IS_ENABLED(CONFIG_MPTCP_IPV6) } else if (sk->sk_family == AF_INET6) { if (ipv6_addr_any(&local->addr.addr6)) local_id = -1; #endif } if (local_id >= 0) subflow_set_local_id(subflow, local_id); subflow->remote_key_valid = 1; subflow->remote_key = READ_ONCE(msk->remote_key); subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", msk, local_id, remote_id, err); goto failed; } mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); pr_debug("msk=%p local=%d remote=%d connect error: %d\n", msk, local_id, remote_id, err); goto failed_unlink; } MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX); /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); mptcp_stop_tout_timer(sk); return 0; failed_unlink: list_del(&subflow->node); sock_put(mptcp_subflow_tcp_sock(subflow)); failed: subflow->disposable = 1; sock_release(sf); err_out: /* we account subflows before the creation, and this failures will not * be caught by sk_state_change() */ mptcp_pm_close_subflow(msk); return err; } static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) { #ifdef CONFIG_SOCK_CGROUP_DATA struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, *child_skcd = &child->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ if (mem_cgroup_sockets_enabled) mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot) ssk->sk_prot = &tcpv6_prot_override; else #endif ssk->sk_prot = &tcp_prot_override; } static void mptcp_subflow_ops_undo_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (ssk->sk_prot == &tcpv6_prot_override) ssk->sk_prot = &tcpv6_prot; else #endif ssk->sk_prot = &tcp_prot; } int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock) { struct mptcp_subflow_context *subflow; struct net *net = sock_net(sk); struct socket *sf; int err; /* un-accepted server sockets can reach here - on bad configuration * bail early to avoid greater trouble later */ if (unlikely(!sk->sk_socket)) return -EINVAL; err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf); if (err) return err; lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); err = security_mptcp_add_subflow(sk, sf->sk); if (err) goto err_free; /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct * user. */ SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); pr_debug("subflow=%p\n", subflow); *new_sock = sf; sock_hold(sk); subflow->conn = sk; mptcp_subflow_ops_override(sf->sk); return 0; err_free: release_sock(sf->sk); sock_release(sf); return err; } static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, gfp_t priority) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; ctx = kzalloc(sizeof(*ctx), priority); if (!ctx) return NULL; rcu_assign_pointer(icsk->icsk_ulp_data, ctx); INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); pr_debug("subflow=%p\n", ctx); ctx->tcp_sock = sk; WRITE_ONCE(ctx->local_id, -1); return ctx; } static void __subflow_state_change(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_all(&wq->wait); rcu_read_unlock(); } static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; __subflow_state_change(sk); if (subflow_simultaneous_connect(sk)) { WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); subflow->conn_finished = 1; mptcp_propagate_state(parent, sk, subflow, NULL); } /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. */ if (mptcp_subflow_data_available(sk)) mptcp_data_ready(parent, sk); else if (unlikely(sk->sk_err)) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) { struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; struct request_sock *req, *head, *tail; struct mptcp_subflow_context *subflow; struct sock *sk, *ssk; /* Due to lock dependencies no relevant lock can be acquired under rskq_lock. * Splice the req list, so that accept() can not reach the pending ssk after * the listener socket is released below. */ spin_lock_bh(&queue->rskq_lock); head = queue->rskq_accept_head; tail = queue->rskq_accept_tail; queue->rskq_accept_head = NULL; queue->rskq_accept_tail = NULL; spin_unlock_bh(&queue->rskq_lock); if (!head) return; /* can't acquire the msk socket lock under the subflow one, * or will cause ABBA deadlock */ release_sock(listener_ssk); for (req = head; req; req = req->dl_next) { ssk = req->sk; if (!sk_is_mptcp(ssk)) continue; subflow = mptcp_subflow_ctx(ssk); if (!subflow || !subflow->conn) continue; sk = subflow->conn; sock_hold(sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); __mptcp_unaccepted_force_close(sk); release_sock(sk); /* lockdep will report a false positive ABBA deadlock * between cancel_work_sync and the listener socket. * The involved locks belong to different sockets WRT * the existing AB chain. * Using a per socket key is problematic as key * deregistration requires process context and must be * performed at socket disposal time, in atomic * context. * Just tell lockdep to consider the listener socket * released here. */ mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); mptcp_cancel_work(sk); mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); sock_put(sk); } /* we are still under the listener msk socket lock */ lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); /* restore the listener queue, to let the TCP code clean it up */ spin_lock_bh(&queue->rskq_lock); WARN_ON_ONCE(queue->rskq_accept_head); queue->rskq_accept_head = head; queue->rskq_accept_tail = tail; spin_unlock_bh(&queue->rskq_lock); } static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct mptcp_subflow_context *ctx; struct tcp_sock *tp = tcp_sk(sk); int err = 0; /* disallow attaching ULP to a socket unless it has been * created with sock_create_kern() */ if (!sk->sk_kern_sock) { err = -EOPNOTSUPP; goto out; } ctx = subflow_create_ctx(sk, GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; icsk->icsk_af_ops = subflow_default_af_ops(sk); ctx->tcp_state_change = sk->sk_state_change; ctx->tcp_error_report = sk->sk_error_report; WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable); WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space); sk->sk_data_ready = subflow_data_ready; sk->sk_write_space = subflow_write_space; sk->sk_state_change = subflow_state_change; sk->sk_error_report = subflow_error_report; out: return err; } static void subflow_ulp_release(struct sock *ssk) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); bool release = true; struct sock *sk; if (!ctx) return; sk = ctx->conn; if (sk) { /* if the msk has been orphaned, keep the ctx * alive, will be freed by __mptcp_close_ssk(), * when the subflow is still unaccepted */ release = ctx->disposable || list_empty(&ctx->node); /* inet_child_forget() does not call sk_state_change(), * explicitly trigger the socket close machinery */ if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) mptcp_schedule_work(sk); sock_put(sk); } mptcp_subflow_ops_undo_override(ssk); if (release) kfree_rcu(ctx, rcu); } static void subflow_ulp_clone(const struct request_sock *req, struct sock *newsk, const gfp_t priority) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; if (!tcp_rsk(req)->is_mptcp || (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx = subflow_create_ctx(newsk, priority); if (!new_ctx) { subflow_ulp_fallback(newsk, old_ctx); return; } new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection * is fully established only after we receive the remote key */ new_ctx->mp_capable = 1; new_ctx->local_key = subflow_req->local_key; new_ctx->token = subflow_req->token; new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->idsn = subflow_req->idsn; /* this is the first subflow, id is always 0 */ subflow_set_local_id(new_ctx, 0); } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; WRITE_ONCE(new_ctx->fully_established, true); new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->request_bkup = subflow_req->request_bkup; WRITE_ONCE(new_ctx->remote_id, subflow_req->remote_id); new_ctx->token = subflow_req->token; new_ctx->thmac = subflow_req->thmac; /* the subflow req id is valid, fetched via subflow_check_req() * and subflow_token_join_request() */ subflow_set_local_id(new_ctx, subflow_req->local_id); } } static void tcp_release_cb_override(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); long status; /* process and clear all the pending actions, but leave the subflow into * the napi queue. To respect locking, only the same CPU that originated * the action can touch the list. mptcp_napi_poll will take care of it. */ status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0); if (status) mptcp_subflow_process_delegated(ssk, status); tcp_release_cb(ssk); } static int tcp_abort_override(struct sock *ssk, int err) { /* closing a listener subflow requires a great deal of care. * keep it simple and just prevent such operation */ if (inet_sk_state_load(ssk) == TCP_LISTEN) return -EINVAL; return tcp_abort(ssk, err); } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { .name = "mptcp", .owner = THIS_MODULE, .init = subflow_ulp_init, .release = subflow_ulp_release, .clone = subflow_ulp_clone, }; static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, NULL); if (!subflow_ops->slab) return -ENOMEM; return 0; } void __init mptcp_subflow_init(void) { mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; subflow_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_specific.sk_rx_dst_set = subflow_finish_connect; subflow_specific.rebuild_header = subflow_rebuild_header; tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock * structures for v4 and v6 have the same size. It should not changed in * the future but better to make sure to be warned if it is no longer * the case. */ BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v6 request sock ops\n"); subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock; subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect; subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header; subflow_v6m_specific = subflow_v6_specific; subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit; subflow_v6m_specific.send_check = ipv4_specific.send_check; subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len; subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced; subflow_v6m_specific.rebuild_header = subflow_rebuild_header; tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; #endif mptcp_diag_subflow_init(&subflow_ulp_ops); if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); }
14 3 14 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions and Declarations for tuple. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_tuple.h */ #ifndef _NF_CONNTRACK_TUPLE_H #define _NF_CONNTRACK_TUPLE_H #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/list_nulls.h> /* A `tuple' is a structure containing the information to uniquely identify a connection. ie. if two packets have the same tuple, they are in the same connection; if not, they are not. We divide the structure along "manipulatable" and "non-manipulatable" lines, for the benefit of the NAT code. */ #define NF_CT_TUPLE_L3SIZE ARRAY_SIZE(((union nf_inet_addr *)NULL)->all) /* The manipulable part of the tuple. */ struct nf_conntrack_man { union nf_inet_addr u3; union nf_conntrack_man_proto u; /* Layer 3 protocol */ u_int16_t l3num; }; /* This contains the information to distinguish a connection. */ struct nf_conntrack_tuple { struct nf_conntrack_man src; /* These are the parts of the tuple which are fixed. */ struct { union nf_inet_addr u3; union { /* Add other protocols here. */ __be16 all; struct { __be16 port; } tcp; struct { __be16 port; } udp; struct { u_int8_t type, code; } icmp; struct { __be16 port; } dccp; struct { __be16 port; } sctp; struct { __be16 key; } gre; } u; /* The protocol. */ u_int8_t protonum; /* The direction must be ignored for the tuplehash */ struct { } __nfct_hash_offsetend; /* The direction (for tuplehash) */ u_int8_t dir; } dst; }; struct nf_conntrack_tuple_mask { struct { union nf_inet_addr u3; union nf_conntrack_man_proto u; } src; }; static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t) { #ifdef DEBUG printk("tuple %p: %u %pI4:%hu -> %pI4:%hu\n", t, t->dst.protonum, &t->src.u3.ip, ntohs(t->src.u.all), &t->dst.u3.ip, ntohs(t->dst.u.all)); #endif } static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t) { #ifdef DEBUG printk("tuple %p: %u %pI6 %hu -> %pI6 %hu\n", t, t->dst.protonum, t->src.u3.all, ntohs(t->src.u.all), t->dst.u3.all, ntohs(t->dst.u.all)); #endif } static inline void nf_ct_dump_tuple(const struct nf_conntrack_tuple *t) { switch (t->src.l3num) { case AF_INET: nf_ct_dump_tuple_ip(t); break; case AF_INET6: nf_ct_dump_tuple_ipv6(t); break; } } /* If we're the first tuple, it's the original dir. */ #define NF_CT_DIRECTION(h) \ ((enum ip_conntrack_dir)(h)->tuple.dst.dir) /* Connections have two entries in the hash table: one for each way */ struct nf_conntrack_tuple_hash { struct hlist_nulls_node hnnode; struct nf_conntrack_tuple tuple; }; static inline bool __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return (nf_inet_addr_cmp(&t1->src.u3, &t2->src.u3) && t1->src.u.all == t2->src.u.all && t1->src.l3num == t2->src.l3num); } static inline bool __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return (nf_inet_addr_cmp(&t1->dst.u3, &t2->dst.u3) && t1->dst.u.all == t2->dst.u.all && t1->dst.protonum == t2->dst.protonum); } static inline bool nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return __nf_ct_tuple_src_equal(t1, t2) && __nf_ct_tuple_dst_equal(t1, t2); } static inline bool nf_ct_tuple_mask_equal(const struct nf_conntrack_tuple_mask *m1, const struct nf_conntrack_tuple_mask *m2) { return (nf_inet_addr_cmp(&m1->src.u3, &m2->src.u3) && m1->src.u.all == m2->src.u.all); } static inline bool nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2, const struct nf_conntrack_tuple_mask *mask) { int count; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) { if ((t1->src.u3.all[count] ^ t2->src.u3.all[count]) & mask->src.u3.all[count]) return false; } if ((t1->src.u.all ^ t2->src.u.all) & mask->src.u.all) return false; if (t1->src.l3num != t2->src.l3num || t1->dst.protonum != t2->dst.protonum) return false; return true; } static inline bool nf_ct_tuple_mask_cmp(const struct nf_conntrack_tuple *t, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { return nf_ct_tuple_src_mask_cmp(t, tuple, mask) && __nf_ct_tuple_dst_equal(t, tuple); } #endif /* _NF_CONNTRACK_TUPLE_H */
41 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef __LINUX_OVERFLOW_H #define __LINUX_OVERFLOW_H #include <linux/compiler.h> #include <linux/limits.h> #include <linux/const.h> /* * We need to compute the minimum and maximum values representable in a given * type. These macros may also be useful elsewhere. It would seem more obvious * to do something like: * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) * * Unfortunately, the middle expressions, strictly speaking, have * undefined behaviour, and at least some versions of gcc warn about * the type_max expression (but not if -fsanitize=undefined is in * effect; in that case, the warning is deferred to runtime...). * * The slightly excessive casting in type_min is to make sure the * macros also produce sensible values for the exotic type _Bool. [The * overflow checkers only almost work for _Bool, but that's * a-feature-not-a-bug, since people shouldn't be doing arithmetic on * _Bools. Besides, the gcc builtins don't allow _Bool* as third * argument.] * * Idea stolen from * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define __type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_max(t) __type_max(typeof(t)) #define __type_min(T) ((T)((T)-type_max(T)-(T)1)) #define type_min(t) __type_min(typeof(t)) /* * Avoids triggering -Wtype-limits compilation warning, * while using unsigned data types to check a < 0. */ #define is_non_negative(a) ((a) > 0 || (a) == 0) #define is_negative(a) (!(is_non_negative(a))) /* * Allows for effectively applying __must_check to a macro so we can have * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ static inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } /** * check_add_overflow() - Calculate addition with overflow checking * @a: first addend * @b: second addend * @d: pointer to store sum * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted addition, regardless of whether * wrap-around occurred. */ #define check_add_overflow(a, b, d) \ __must_check_overflow(__builtin_add_overflow(a, b, d)) /** * wrapping_add() - Intentionally perform a wrapping addition * @type: type for result of calculation * @a: first addend * @b: second addend * * Return the potentially wrapped-around addition without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_add(type, a, b) \ ({ \ type __val; \ __builtin_add_overflow(a, b, &__val); \ __val; \ }) /** * wrapping_assign_add() - Intentionally perform a wrapping increment assignment * @var: variable to be incremented * @offset: amount to add * * Increments @var by @offset with wrap-around. Returns the resulting * value of @var. Will not trip any wrap-around sanitizers. * * Returns the new value of @var. */ #define wrapping_assign_add(var, offset) \ ({ \ typeof(var) *__ptr = &(var); \ *__ptr = wrapping_add(typeof(var), *__ptr, offset); \ }) /** * check_sub_overflow() - Calculate subtraction with overflow checking * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * @d: pointer to store difference * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted subtraction, regardless of whether * wrap-around occurred. */ #define check_sub_overflow(a, b, d) \ __must_check_overflow(__builtin_sub_overflow(a, b, d)) /** * wrapping_sub() - Intentionally perform a wrapping subtraction * @type: type for result of calculation * @a: minuend; value to subtract from * @b: subtrahend; value to subtract from @a * * Return the potentially wrapped-around subtraction without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_sub(type, a, b) \ ({ \ type __val; \ __builtin_sub_overflow(a, b, &__val); \ __val; \ }) /** * wrapping_assign_sub() - Intentionally perform a wrapping decrement assign * @var: variable to be decremented * @offset: amount to subtract * * Decrements @var by @offset with wrap-around. Returns the resulting * value of @var. Will not trip any wrap-around sanitizers. * * Returns the new value of @var. */ #define wrapping_assign_sub(var, offset) \ ({ \ typeof(var) *__ptr = &(var); \ *__ptr = wrapping_sub(typeof(var), *__ptr, offset); \ }) /** * check_mul_overflow() - Calculate multiplication with overflow checking * @a: first factor * @b: second factor * @d: pointer to store product * * Returns true on wrap-around, false otherwise. * * *@d holds the results of the attempted multiplication, regardless of whether * wrap-around occurred. */ #define check_mul_overflow(a, b, d) \ __must_check_overflow(__builtin_mul_overflow(a, b, d)) /** * wrapping_mul() - Intentionally perform a wrapping multiplication * @type: type for result of calculation * @a: first factor * @b: second factor * * Return the potentially wrapped-around multiplication without * tripping any wrap-around sanitizers that may be enabled. */ #define wrapping_mul(type, a, b) \ ({ \ type __val; \ __builtin_mul_overflow(a, b, &__val); \ __val; \ }) /** * check_shl_overflow() - Calculate a left-shifted value and check overflow * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * * Returns true if '*@d' cannot hold the result or when '@a << @s' doesn't * make sense. Example conditions: * * - '@a << @s' causes bits to be lost when stored in *@d. * - '@s' is garbage (e.g. negative) or so large that the result of * '@a << @s' is guaranteed to be 0. * - '@a' is negative. * - '@a << @s' sets the sign bit, if any, in '*@d'. * * '*@d' will hold the results of the attempted shift, but is not * considered "safe for use" if true is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ unsigned long long _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ (*_d >> _to_shift) != _a); \ })) #define __overflows_type_constexpr(x, T) ( \ is_unsigned_type(typeof(x)) ? \ (x) > type_max(T) : \ is_unsigned_type(typeof(T)) ? \ (x) < 0 || (x) > type_max(T) : \ (x) < type_min(T) || (x) > type_max(T)) #define __overflows_type(x, T) ({ \ typeof(T) v = 0; \ check_add_overflow((x), v, &v); \ }) /** * overflows_type - helper for checking the overflows between value, variables, * or data type * * @n: source constant value or variable to be checked * @T: destination variable or data type proposed to store @x * * Compares the @x expression for whether or not it can safely fit in * the storage of the type in @T. @x and @T can have different types. * If @x is a constant expression, this will also resolve to a constant * expression. * * Returns: true if overflow can occur, false otherwise. */ #define overflows_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ __overflows_type_constexpr(n, T), \ __overflows_type(n, T)) /** * range_overflows() - Check if a range is out of bounds * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * A strict check to determine if the range [@start, @start + @size) is * invalid with respect to the allowable range [0, @max). Any range * starting at or beyond @max is considered an overflow, even if @size is 0. * * Returns: true if the range is out of bounds. */ #define range_overflows(start, size, max) ({ \ typeof(start) start__ = (start); \ typeof(size) size__ = (size); \ typeof(max) max__ = (max); \ (void)(&start__ == &size__); \ (void)(&start__ == &max__); \ start__ >= max__ || size__ > max__ - start__; \ }) /** * range_overflows_t() - Check if a range is out of bounds * @type: Data type to use. * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * Same as range_overflows() but forcing the parameters to @type. * * Returns: true if the range is out of bounds. */ #define range_overflows_t(type, start, size, max) \ range_overflows((type)(start), (type)(size), (type)(max)) /** * range_end_overflows() - Check if a range's endpoint is out of bounds * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * Checks only if the endpoint of a range (@start + @size) exceeds @max. * Unlike range_overflows(), a zero-sized range at the boundary (@start == @max) * is not considered an overflow. Useful for iterator-style checks. * * Returns: true if the endpoint exceeds the boundary. */ #define range_end_overflows(start, size, max) ({ \ typeof(start) start__ = (start); \ typeof(size) size__ = (size); \ typeof(max) max__ = (max); \ (void)(&start__ == &size__); \ (void)(&start__ == &max__); \ start__ > max__ || size__ > max__ - start__; \ }) /** * range_end_overflows_t() - Check if a range's endpoint is out of bounds * @type: Data type to use. * @start: Start of the range. * @size: Size of the range. * @max: Exclusive upper boundary. * * Same as range_end_overflows() but forcing the parameters to @type. * * Returns: true if the endpoint exceeds the boundary. */ #define range_end_overflows_t(type, start, size, max) \ range_end_overflows((type)(start), (type)(size), (type)(max)) /** * castable_to_type - like __same_type(), but also allows for casted literals * * @n: variable or constant value * @T: variable or data type * * Unlike the __same_type() macro, this allows a constant value as the * first argument. If this value would not overflow into an assignment * of the second argument's type, it returns true. Otherwise, this falls * back to __same_type(). */ #define castable_to_type(n, T) \ __builtin_choose_expr(__is_constexpr(n), \ !__overflows_type_constexpr(n, T), \ __same_type(n, T)) /** * size_mul() - Calculate size_t multiplication with saturation at SIZE_MAX * @factor1: first factor * @factor2: second factor * * Returns: calculate @factor1 * @factor2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_mul(size_t factor1, size_t factor2) { size_t bytes; if (check_mul_overflow(factor1, factor2, &bytes)) return SIZE_MAX; return bytes; } /** * size_add() - Calculate size_t addition with saturation at SIZE_MAX * @addend1: first addend * @addend2: second addend * * Returns: calculate @addend1 + @addend2, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. The * lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_add(size_t addend1, size_t addend2) { size_t bytes; if (check_add_overflow(addend1, addend2, &bytes)) return SIZE_MAX; return bytes; } /** * size_sub() - Calculate size_t subtraction with saturation at SIZE_MAX * @minuend: value to subtract from * @subtrahend: value to subtract from @minuend * * Returns: calculate @minuend - @subtrahend, both promoted to size_t, * with any overflow causing the return value to be SIZE_MAX. For * composition with the size_add() and size_mul() helpers, neither * argument may be SIZE_MAX (or the result with be forced to SIZE_MAX). * The lvalue must be size_t to avoid implicit type conversion. */ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) { size_t bytes; if (minuend == SIZE_MAX || subtrahend == SIZE_MAX || check_sub_overflow(minuend, subtrahend, &bytes)) return SIZE_MAX; return bytes; } /** * array_size() - Calculate size of 2-dimensional array. * @a: dimension one * @b: dimension two * * Calculates size of 2-dimensional array: @a * @b. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array_size(a, b) size_mul(a, b) /** * array3_size() - Calculate size of 3-dimensional array. * @a: dimension one * @b: dimension two * @c: dimension three * * Calculates size of 3-dimensional array: @a * @b * @c. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ #define array3_size(a, b, c) size_mul(size_mul(a, b), c) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. * * Calculates size of a flexible array of @count number of @member * elements, at the end of structure @p. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ (count) * sizeof(*(p)->member) + __must_be_array((p)->member), \ size_mul(count, sizeof(*(p)->member) + __must_be_array((p)->member))) /** * struct_size() - Calculate size of structure with trailing flexible array. * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure of @p followed by an * array of @count number of @member elements. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size(p, member, count) \ __builtin_choose_expr(__is_constexpr(count), \ sizeof(*(p)) + flex_array_size(p, member, count), \ size_add(sizeof(*(p)), flex_array_size(p, member, count))) /** * struct_size_t() - Calculate size of structure with trailing flexible array * @type: structure type name. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure @type followed by an * array of @count number of @member elements. Prefer using struct_size() * when possible instead, to keep calculations associated with a specific * instance variable of type @type. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size_t(type, member, count) \ struct_size((type *)NULL, member, count) /** * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass arbitrary trailing expressions * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * @trailer: Trailing expressions for attributes and/or initializers. */ #define __DEFINE_FLEX(type, name, member, count, trailer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ u8 bytes[struct_size_t(type, member, count)]; \ type obj; \ } name##_u trailer; \ type *name = (type *)&name##_u /** * _DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass (different) initializer. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * @initializer: Initializer expression (e.g., pass `= { }` at minimum). */ #define _DEFINE_FLEX(type, name, member, count, initializer...) \ __DEFINE_FLEX(type, name, member, count, = { .obj initializer }) /** * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member, when it does not have a __counted_by annotation. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. * @member: Name of the array member. * @count: Number of elements in the array; must be compile-time const. * * Define a zeroed, on-stack, instance of @type structure with a trailing * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. * Use __member_size(@name->member) to get compile-time size of @name members. * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of * elements in array @member. */ #define DEFINE_RAW_FLEX(type, name, member, count) \ __DEFINE_FLEX(type, name, member, count, = { }) /** * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing * flexible array member. * * @TYPE: structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @MEMBER: Name of the array member. * @COUNTER: Name of the __counted_by member. * @COUNT: Number of elements in the array; must be compile-time const. * * Define a zeroed, on-stack, instance of @TYPE structure with a trailing * flexible array member. * Use __struct_size(@NAME) to get compile-time size of it afterwards. * Use __member_size(@NAME->member) to get compile-time size of @NAME members. * Use STACK_FLEX_ARRAY_SIZE(@name, @member) to get compile-time number of * elements in array @member. */ #define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .COUNTER = COUNT, }) /** * STACK_FLEX_ARRAY_SIZE() - helper macro for DEFINE_FLEX() family. * Returns the number of elements in @array. * * @name: Name for a variable defined in DEFINE_RAW_FLEX()/DEFINE_FLEX(). * @array: Name of the array member. */ #define STACK_FLEX_ARRAY_SIZE(name, array) \ (__member_size((name)->array) / sizeof(*(name)->array) + \ __must_be_array((name)->array)) #endif /* __LINUX_OVERFLOW_H */
54 55 50 8 3 9 9 8 3 55 49 9 49 9 48 47 3 48 46 47 46 46 46 1 4 2 9 9 8 2 8 2 9 3 2 9 35 34 1 1 33 1 2 3 11 11 1 9 2 4 9 1 9 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 // SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include "cgroup-internal.h" #include <trace/events/cgroup.h> /* * Update CGRP_FROZEN of cgroup.flag * Return true if flags is updated; false if flags has no change */ static bool cgroup_update_frozen_flag(struct cgroup *cgrp, bool frozen) { lockdep_assert_held(&css_set_lock); /* Already there? */ if (test_bit(CGRP_FROZEN, &cgrp->flags) == frozen) return false; if (frozen) set_bit(CGRP_FROZEN, &cgrp->flags); else clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); return true; } /* * Propagate the cgroup frozen state upwards by the cgroup tree. */ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { int desc = 1; /* * If the new state is frozen, some freezing ancestor cgroups may change * their state too, depending on if all their descendants are frozen. * * Otherwise, all ancestor cgroups are forced into the non-frozen state. */ while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; if (!test_bit(CGRP_FREEZE, &cgrp->flags) || (cgrp->freezer.nr_frozen_descendants != cgrp->nr_descendants)) continue; } else { cgrp->freezer.nr_frozen_descendants -= desc; } if (cgroup_update_frozen_flag(cgrp, frozen)) desc++; } } /* * Revisit the cgroup frozen state. * Checks if the cgroup is really frozen and perform all state transitions. */ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider * the cgroup frozen. Otherwise it's not frozen. */ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); /* If flags is updated, update the state of ancestor cgroups. */ if (cgroup_update_frozen_flag(cgrp, frozen)) cgroup_propagate_frozen(cgrp, frozen); } /* * Increment cgroup's nr_frozen_tasks. */ static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks++; } /* * Decrement cgroup's nr_frozen_tasks. */ static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks--; WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } /* * Enter frozen/stopped state, if not yet there. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. */ void cgroup_enter_frozen(void) { struct cgroup *cgrp; if (current->frozen) return; spin_lock_irq(&css_set_lock); current->frozen = true; cgrp = task_dfl_cgroup(current); cgroup_inc_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Conditionally leave frozen/stopped state. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. * * If always_leave is not set, and the cgroup is freezing, * we're racing with the cgroup freezing. In this case, we don't * drop the frozen counter to avoid a transient switch to * the unfrozen state. */ void cgroup_leave_frozen(bool always_leave) { struct cgroup *cgrp; spin_lock_irq(&css_set_lock); cgrp = task_dfl_cgroup(current); if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { cgroup_dec_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { spin_lock(&current->sighand->siglock); current->jobctl |= JOBCTL_TRAP_FREEZE; set_thread_flag(TIF_SIGPENDING); spin_unlock(&current->sighand->siglock); } spin_unlock_irq(&css_set_lock); } /* * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE * jobctl bit. */ static void cgroup_freeze_task(struct task_struct *task, bool freeze) { unsigned long flags; /* If the task is about to die, don't bother with freezing it. */ if (!lock_task_sighand(task, &flags)) return; if (freeze) { task->jobctl |= JOBCTL_TRAP_FREEZE; signal_wake_up(task, false); } else { task->jobctl &= ~JOBCTL_TRAP_FREEZE; wake_up_process(task); } unlock_task_sighand(task, &flags); } /* * Freeze or unfreeze all tasks in the given cgroup. */ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); write_seqcount_begin(&cgrp->freezer.freeze_seq); if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); cgrp->freezer.freeze_start_nsec = ts_nsec; } else { clear_bit(CGRP_FREEZE, &cgrp->flags); cgrp->freezer.frozen_nsec += (ts_nsec - cgrp->freezer.freeze_start_nsec); } write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) TRACE_CGROUP_PATH(freeze, cgrp); else TRACE_CGROUP_PATH(unfreeze, cgrp); css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* * Ignore kernel threads here. Freezing cgroups containing * kthreads isn't supported. */ if (task->flags & PF_KTHREAD) continue; cgroup_freeze_task(task, freeze); } css_task_iter_end(&it); /* * Cgroup state should be revisited here to cover empty leaf cgroups * and cgroups which descendants are already in the desired state. */ spin_lock_irq(&css_set_lock); if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Adjust the task state (freeze or unfreeze) and revisit the state of * source and destination cgroups. */ void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst) { lockdep_assert_held(&css_set_lock); /* * Kernel threads are not supposed to be frozen at all. */ if (task->flags & PF_KTHREAD) return; /* * It's not necessary to do changes if both of the src and dst cgroups * are not freezing and task is not frozen. */ if (!test_bit(CGRP_FREEZE, &src->flags) && !test_bit(CGRP_FREEZE, &dst->flags) && !task->frozen) return; /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not * frozen, we bump both counters to keep them balanced. */ if (task->frozen) { cgroup_inc_frozen_cnt(dst); cgroup_dec_frozen_cnt(src); } cgroup_update_frozen(dst); cgroup_update_frozen(src); /* * Force the task to the desired state. */ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; struct cgroup *parent; struct cgroup *dsct; bool applied = false; u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); /* * Nothing changed? Just exit. */ if (cgrp->freezer.freeze == freeze) return; cgrp->freezer.freeze = freeze; ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. */ css_for_each_descendant_pre(css, &cgrp->self) { dsct = css->cgroup; if (cgroup_is_dead(dsct)) continue; /* * e_freeze is affected by parent's e_freeze and dst's freeze. * If old e_freeze eq new e_freeze, no change, its children * will not be affected. So do nothing and skip the subtree */ old_e = dsct->freezer.e_freeze; parent = cgroup_parent(dsct); dsct->freezer.e_freeze = (dsct->freezer.freeze || parent->freezer.e_freeze); if (dsct->freezer.e_freeze == old_e) { css = css_rightmost_descendant(css); continue; } /* * Do change actual state: freeze or unfreeze. */ cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } /* * Even if the actual state hasn't changed, let's notify a user. * The state can be enforced by an ancestor cgroup: the cgroup * can already be in the desired state or it can be locked in the * opposite state, so that the transition will never happen. * In both cases it's better to notify a user, that there is * nothing to wait for. */ if (!applied) { TRACE_CGROUP_PATH(notify_frozen, cgrp, test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); } }
1 2 3 4 1 4 4 3 1 3 1 1 2 2 5 5 1 2 2 2 1 10 9 1 1 1 1 2 2 1 10 9 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 // SPDX-License-Identifier: GPL-2.0-only /* * Intel & MS High Precision Event Timer Implementation. * * Copyright (C) 2003 Intel Corporation * Venki Pallipadi * (c) Copyright 2004 Hewlett-Packard Development Company, L.P. * Bob Picco <robert.picco@hp.com> */ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/miscdevice.h> #include <linux/major.h> #include <linux/ioport.h> #include <linux/fcntl.h> #include <linux/init.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/poll.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/sysctl.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/bcd.h> #include <linux/seq_file.h> #include <linux/bitops.h> #include <linux/compat.h> #include <linux/clocksource.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/io.h> #include <linux/acpi.h> #include <linux/hpet.h> #include <asm/current.h> #include <asm/irq.h> #include <asm/div64.h> /* * The High Precision Event Timer driver. * This driver is closely modelled after the rtc.c driver. * See HPET spec revision 1. */ #define HPET_USER_FREQ (64) #define HPET_DRIFT (500) #define HPET_RANGE_SIZE 1024 /* from HPET spec */ /* WARNING -- don't get confused. These macros are never used * to write the (single) counter, and rarely to read it. * They're badly named; to fix, someday. */ #if BITS_PER_LONG == 64 #define write_counter(V, MC) writeq(V, MC) #define read_counter(MC) readq(MC) #else #define write_counter(V, MC) writel(V, MC) #define read_counter(MC) readl(MC) #endif static DEFINE_MUTEX(hpet_mutex); /* replaces BKL */ static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ; /* A lock for concurrent access by app and isr hpet activity. */ static DEFINE_SPINLOCK(hpet_lock); #define HPET_DEV_NAME (7) struct hpet_dev { struct hpets *hd_hpets; struct hpet __iomem *hd_hpet; struct hpet_timer __iomem *hd_timer; unsigned long hd_ireqfreq; unsigned long hd_irqdata; wait_queue_head_t hd_waitqueue; struct fasync_struct *hd_async_queue; unsigned int hd_flags; unsigned int hd_irq; unsigned int hd_hdwirq; char hd_name[HPET_DEV_NAME]; }; struct hpets { struct hpets *hp_next; struct hpet __iomem *hp_hpet; unsigned long hp_hpet_phys; unsigned long long hp_tick_freq; unsigned long hp_delta; unsigned int hp_ntimer; unsigned int hp_which; struct hpet_dev hp_dev[] __counted_by(hp_ntimer); }; static struct hpets *hpets; #define HPET_OPEN 0x0001 #define HPET_IE 0x0002 /* interrupt enabled */ #define HPET_PERIODIC 0x0004 #define HPET_SHARED_IRQ 0x0008 static irqreturn_t hpet_interrupt(int irq, void *data) { struct hpet_dev *devp; unsigned long isr; devp = data; isr = 1 << (devp - devp->hd_hpets->hp_dev); if ((devp->hd_flags & HPET_SHARED_IRQ) && !(isr & readl(&devp->hd_hpet->hpet_isr))) return IRQ_NONE; spin_lock(&hpet_lock); devp->hd_irqdata++; /* * For non-periodic timers, increment the accumulator. * This has the effect of treating non-periodic like periodic. */ if ((devp->hd_flags & (HPET_IE | HPET_PERIODIC)) == HPET_IE) { unsigned long t, mc, base, k; struct hpet __iomem *hpet = devp->hd_hpet; struct hpets *hpetp = devp->hd_hpets; t = devp->hd_ireqfreq; read_counter(&devp->hd_timer->hpet_compare); mc = read_counter(&hpet->hpet_mc); /* The time for the next interrupt would logically be t + m, * however, if we are very unlucky and the interrupt is delayed * for longer than t then we will completely miss the next * interrupt if we set t + m and an application will hang. * Therefore we need to make a more complex computation assuming * that there exists a k for which the following is true: * k * t + base < mc + delta * (k + 1) * t + base > mc + delta * where t is the interval in hpet ticks for the given freq, * base is the theoretical start value 0 < base < t, * mc is the main counter value at the time of the interrupt, * delta is the time it takes to write the a value to the * comparator. * k may then be computed as (mc - base + delta) / t . */ base = mc % t; k = (mc - base + hpetp->hp_delta) / t; write_counter(t * (k + 1) + base, &devp->hd_timer->hpet_compare); } if (devp->hd_flags & HPET_SHARED_IRQ) writel(isr, &devp->hd_hpet->hpet_isr); spin_unlock(&hpet_lock); wake_up_interruptible(&devp->hd_waitqueue); kill_fasync(&devp->hd_async_queue, SIGIO, POLL_IN); return IRQ_HANDLED; } static void hpet_timer_set_irq(struct hpet_dev *devp) { const unsigned int nr_irqs = irq_get_nr_irqs(); unsigned long v; int irq, gsi; struct hpet_timer __iomem *timer; spin_lock_irq(&hpet_lock); if (devp->hd_hdwirq) { spin_unlock_irq(&hpet_lock); return; } timer = devp->hd_timer; /* we prefer level triggered mode */ v = readl(&timer->hpet_config); if (!(v & Tn_INT_TYPE_CNF_MASK)) { v |= Tn_INT_TYPE_CNF_MASK; writel(v, &timer->hpet_config); } spin_unlock_irq(&hpet_lock); v = (readq(&timer->hpet_config) & Tn_INT_ROUTE_CAP_MASK) >> Tn_INT_ROUTE_CAP_SHIFT; /* * In PIC mode, skip IRQ0-4, IRQ6-9, IRQ12-15 which is always used by * legacy device. In IO APIC mode, we skip all the legacy IRQS. */ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) v &= ~0xf3df; else v &= ~0xffff; for_each_set_bit(irq, &v, HPET_MAX_IRQ) { if (irq >= nr_irqs) { irq = HPET_MAX_IRQ; break; } gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); if (gsi > 0) break; /* FIXME: Setup interrupt source table */ } if (irq < HPET_MAX_IRQ) { spin_lock_irq(&hpet_lock); v = readl(&timer->hpet_config); v |= irq << Tn_INT_ROUTE_CNF_SHIFT; writel(v, &timer->hpet_config); devp->hd_hdwirq = gsi; spin_unlock_irq(&hpet_lock); } return; } static int hpet_open(struct inode *inode, struct file *file) { struct hpet_dev *devp; struct hpets *hpetp; int i; if (file->f_mode & FMODE_WRITE) return -EINVAL; mutex_lock(&hpet_mutex); spin_lock_irq(&hpet_lock); for (devp = NULL, hpetp = hpets; hpetp && !devp; hpetp = hpetp->hp_next) for (i = 0; i < hpetp->hp_ntimer; i++) if (hpetp->hp_dev[i].hd_flags & HPET_OPEN) { continue; } else { devp = &hpetp->hp_dev[i]; break; } if (!devp) { spin_unlock_irq(&hpet_lock); mutex_unlock(&hpet_mutex); return -EBUSY; } file->private_data = devp; devp->hd_irqdata = 0; devp->hd_flags |= HPET_OPEN; spin_unlock_irq(&hpet_lock); mutex_unlock(&hpet_mutex); hpet_timer_set_irq(devp); return 0; } static ssize_t hpet_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) { DECLARE_WAITQUEUE(wait, current); unsigned long data; ssize_t retval; struct hpet_dev *devp; devp = file->private_data; if (!devp->hd_ireqfreq) return -EIO; if (in_compat_syscall()) { if (count < sizeof(compat_ulong_t)) return -EINVAL; } else { if (count < sizeof(unsigned long)) return -EINVAL; } add_wait_queue(&devp->hd_waitqueue, &wait); for ( ; ; ) { set_current_state(TASK_INTERRUPTIBLE); spin_lock_irq(&hpet_lock); data = devp->hd_irqdata; devp->hd_irqdata = 0; spin_unlock_irq(&hpet_lock); if (data) { break; } else if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto out; } else if (signal_pending(current)) { retval = -ERESTARTSYS; goto out; } schedule(); } if (in_compat_syscall()) { retval = put_user(data, (compat_ulong_t __user *)buf); if (!retval) retval = sizeof(compat_ulong_t); } else { retval = put_user(data, (unsigned long __user *)buf); if (!retval) retval = sizeof(unsigned long); } out: __set_current_state(TASK_RUNNING); remove_wait_queue(&devp->hd_waitqueue, &wait); return retval; } static __poll_t hpet_poll(struct file *file, poll_table * wait) { unsigned long v; struct hpet_dev *devp; devp = file->private_data; if (!devp->hd_ireqfreq) return 0; poll_wait(file, &devp->hd_waitqueue, wait); spin_lock_irq(&hpet_lock); v = devp->hd_irqdata; spin_unlock_irq(&hpet_lock); if (v != 0) return EPOLLIN | EPOLLRDNORM; return 0; } #ifdef CONFIG_HPET_MMAP #ifdef CONFIG_HPET_MMAP_DEFAULT static int hpet_mmap_enabled = 1; #else static int hpet_mmap_enabled = 0; #endif static __init int hpet_mmap_enable(char *str) { get_option(&str, &hpet_mmap_enabled); pr_info("HPET mmap %s\n", hpet_mmap_enabled ? "enabled" : "disabled"); return 1; } __setup("hpet_mmap=", hpet_mmap_enable); static int hpet_mmap(struct file *file, struct vm_area_struct *vma) { struct hpet_dev *devp; unsigned long addr; if (!hpet_mmap_enabled) return -EACCES; devp = file->private_data; addr = devp->hd_hpets->hp_hpet_phys; if (addr & (PAGE_SIZE - 1)) return -ENOSYS; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return vm_iomap_memory(vma, addr, PAGE_SIZE); } #else static int hpet_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } #endif static int hpet_fasync(int fd, struct file *file, int on) { struct hpet_dev *devp; devp = file->private_data; if (fasync_helper(fd, file, on, &devp->hd_async_queue) >= 0) return 0; else return -EIO; } static int hpet_release(struct inode *inode, struct file *file) { struct hpet_dev *devp; struct hpet_timer __iomem *timer; int irq = 0; devp = file->private_data; timer = devp->hd_timer; spin_lock_irq(&hpet_lock); writeq((readq(&timer->hpet_config) & ~Tn_INT_ENB_CNF_MASK), &timer->hpet_config); irq = devp->hd_irq; devp->hd_irq = 0; devp->hd_ireqfreq = 0; if (devp->hd_flags & HPET_PERIODIC && readq(&timer->hpet_config) & Tn_TYPE_CNF_MASK) { unsigned long v; v = readq(&timer->hpet_config); v ^= Tn_TYPE_CNF_MASK; writeq(v, &timer->hpet_config); } devp->hd_flags &= ~(HPET_OPEN | HPET_IE | HPET_PERIODIC); spin_unlock_irq(&hpet_lock); if (irq) free_irq(irq, devp); file->private_data = NULL; return 0; } static int hpet_ioctl_ieon(struct hpet_dev *devp) { struct hpet_timer __iomem *timer; struct hpet __iomem *hpet; struct hpets *hpetp; int irq; unsigned long g, v, t, m; unsigned long flags, isr; timer = devp->hd_timer; hpet = devp->hd_hpet; hpetp = devp->hd_hpets; if (!devp->hd_ireqfreq) return -EIO; spin_lock_irq(&hpet_lock); if (devp->hd_flags & HPET_IE) { spin_unlock_irq(&hpet_lock); return -EBUSY; } devp->hd_flags |= HPET_IE; if (readl(&timer->hpet_config) & Tn_INT_TYPE_CNF_MASK) devp->hd_flags |= HPET_SHARED_IRQ; spin_unlock_irq(&hpet_lock); irq = devp->hd_hdwirq; if (irq) { unsigned long irq_flags; if (devp->hd_flags & HPET_SHARED_IRQ) { /* * To prevent the interrupt handler from seeing an * unwanted interrupt status bit, program the timer * so that it will not fire in the near future ... */ writel(readl(&timer->hpet_config) & ~Tn_TYPE_CNF_MASK, &timer->hpet_config); write_counter(read_counter(&hpet->hpet_mc), &timer->hpet_compare); /* ... and clear any left-over status. */ isr = 1 << (devp - devp->hd_hpets->hp_dev); writel(isr, &hpet->hpet_isr); } sprintf(devp->hd_name, "hpet%d", (int)(devp - hpetp->hp_dev)); irq_flags = devp->hd_flags & HPET_SHARED_IRQ ? IRQF_SHARED : 0; if (request_irq(irq, hpet_interrupt, irq_flags, devp->hd_name, (void *)devp)) { printk(KERN_ERR "hpet: IRQ %d is not free\n", irq); irq = 0; } } if (irq == 0) { spin_lock_irq(&hpet_lock); devp->hd_flags ^= HPET_IE; spin_unlock_irq(&hpet_lock); return -EIO; } devp->hd_irq = irq; t = devp->hd_ireqfreq; v = readq(&timer->hpet_config); /* 64-bit comparators are not yet supported through the ioctls, * so force this into 32-bit mode if it supports both modes */ g = v | Tn_32MODE_CNF_MASK | Tn_INT_ENB_CNF_MASK; if (devp->hd_flags & HPET_PERIODIC) { g |= Tn_TYPE_CNF_MASK; v |= Tn_TYPE_CNF_MASK | Tn_VAL_SET_CNF_MASK; writeq(v, &timer->hpet_config); local_irq_save(flags); /* * NOTE: First we modify the hidden accumulator * register supported by periodic-capable comparators. * We never want to modify the (single) counter; that * would affect all the comparators. The value written * is the counter value when the first interrupt is due. */ m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); /* * Then we modify the comparator, indicating the period * for subsequent interrupt. */ write_counter(t, &timer->hpet_compare); } else { local_irq_save(flags); m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); } if (devp->hd_flags & HPET_SHARED_IRQ) { isr = 1 << (devp - devp->hd_hpets->hp_dev); writel(isr, &hpet->hpet_isr); } writeq(g, &timer->hpet_config); local_irq_restore(flags); return 0; } /* converts Hz to number of timer ticks */ static inline unsigned long hpet_time_div(struct hpets *hpets, unsigned long dis) { unsigned long long m; m = hpets->hp_tick_freq + (dis >> 1); return div64_ul(m, dis); } static int hpet_ioctl_common(struct hpet_dev *devp, unsigned int cmd, unsigned long arg, struct hpet_info *info) { struct hpet_timer __iomem *timer; struct hpets *hpetp; int err; unsigned long v; switch (cmd) { case HPET_IE_OFF: case HPET_INFO: case HPET_EPI: case HPET_DPI: case HPET_IRQFREQ: timer = devp->hd_timer; hpetp = devp->hd_hpets; break; case HPET_IE_ON: return hpet_ioctl_ieon(devp); default: return -EINVAL; } err = 0; switch (cmd) { case HPET_IE_OFF: if ((devp->hd_flags & HPET_IE) == 0) break; v = readq(&timer->hpet_config); v &= ~Tn_INT_ENB_CNF_MASK; writeq(v, &timer->hpet_config); if (devp->hd_irq) { free_irq(devp->hd_irq, devp); devp->hd_irq = 0; } devp->hd_flags ^= HPET_IE; break; case HPET_INFO: { memset(info, 0, sizeof(*info)); if (devp->hd_ireqfreq) info->hi_ireqfreq = hpet_time_div(hpetp, devp->hd_ireqfreq); info->hi_flags = readq(&timer->hpet_config) & Tn_PER_INT_CAP_MASK; info->hi_hpet = hpetp->hp_which; info->hi_timer = devp - hpetp->hp_dev; break; } case HPET_EPI: v = readq(&timer->hpet_config); if ((v & Tn_PER_INT_CAP_MASK) == 0) { err = -ENXIO; break; } devp->hd_flags |= HPET_PERIODIC; break; case HPET_DPI: v = readq(&timer->hpet_config); if ((v & Tn_PER_INT_CAP_MASK) == 0) { err = -ENXIO; break; } if (devp->hd_flags & HPET_PERIODIC && readq(&timer->hpet_config) & Tn_TYPE_CNF_MASK) { v = readq(&timer->hpet_config); v ^= Tn_TYPE_CNF_MASK; writeq(v, &timer->hpet_config); } devp->hd_flags &= ~HPET_PERIODIC; break; case HPET_IRQFREQ: if ((arg > hpet_max_freq) && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; break; } if (!arg) { err = -EINVAL; break; } devp->hd_ireqfreq = hpet_time_div(hpetp, arg); } return err; } static long hpet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hpet_info info; int err; mutex_lock(&hpet_mutex); err = hpet_ioctl_common(file->private_data, cmd, arg, &info); mutex_unlock(&hpet_mutex); if ((cmd == HPET_INFO) && !err && (copy_to_user((void __user *)arg, &info, sizeof(info)))) err = -EFAULT; return err; } #ifdef CONFIG_COMPAT struct compat_hpet_info { compat_ulong_t hi_ireqfreq; /* Hz */ compat_ulong_t hi_flags; /* information */ unsigned short hi_hpet; unsigned short hi_timer; }; /* 32-bit types would lead to different command codes which should be * translated into 64-bit ones before passed to hpet_ioctl_common */ #define COMPAT_HPET_INFO _IOR('h', 0x03, struct compat_hpet_info) #define COMPAT_HPET_IRQFREQ _IOW('h', 0x6, compat_ulong_t) static long hpet_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct hpet_info info; int err; if (cmd == COMPAT_HPET_INFO) cmd = HPET_INFO; if (cmd == COMPAT_HPET_IRQFREQ) cmd = HPET_IRQFREQ; mutex_lock(&hpet_mutex); err = hpet_ioctl_common(file->private_data, cmd, arg, &info); mutex_unlock(&hpet_mutex); if ((cmd == HPET_INFO) && !err) { struct compat_hpet_info __user *u = compat_ptr(arg); if (put_user(info.hi_ireqfreq, &u->hi_ireqfreq) || put_user(info.hi_flags, &u->hi_flags) || put_user(info.hi_hpet, &u->hi_hpet) || put_user(info.hi_timer, &u->hi_timer)) err = -EFAULT; } return err; } #endif static const struct file_operations hpet_fops = { .owner = THIS_MODULE, .read = hpet_read, .poll = hpet_poll, .unlocked_ioctl = hpet_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hpet_compat_ioctl, #endif .open = hpet_open, .release = hpet_release, .fasync = hpet_fasync, .mmap = hpet_mmap, }; static int hpet_is_known(struct hpet_data *hdp) { struct hpets *hpetp; for (hpetp = hpets; hpetp; hpetp = hpetp->hp_next) if (hpetp->hp_hpet_phys == hdp->hd_phys_address) return 1; return 0; } static const struct ctl_table hpet_table[] = { { .procname = "max-user-freq", .data = &hpet_max_freq, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static struct ctl_table_header *sysctl_header; /* * Adjustment for when arming the timer with * initial conditions. That is, main counter * ticks expired before interrupts are enabled. */ #define TICK_CALIBRATE (1000UL) static unsigned long __hpet_calibrate(struct hpets *hpetp) { struct hpet_timer __iomem *timer = NULL; unsigned long t, m, count, i, flags, start; struct hpet_dev *devp; int j; struct hpet __iomem *hpet; for (j = 0, devp = hpetp->hp_dev; j < hpetp->hp_ntimer; j++, devp++) if ((devp->hd_flags & HPET_OPEN) == 0) { timer = devp->hd_timer; break; } if (!timer) return 0; hpet = hpetp->hp_hpet; t = read_counter(&timer->hpet_compare); i = 0; count = hpet_time_div(hpetp, TICK_CALIBRATE); local_irq_save(flags); start = read_counter(&hpet->hpet_mc); do { m = read_counter(&hpet->hpet_mc); write_counter(t + m + hpetp->hp_delta, &timer->hpet_compare); } while (i++, (m - start) < count); local_irq_restore(flags); return (m - start) / i; } static unsigned long hpet_calibrate(struct hpets *hpetp) { unsigned long ret = ~0UL; unsigned long tmp; /* * Try to calibrate until return value becomes stable small value. * If SMI interruption occurs in calibration loop, the return value * will be big. This avoids its impact. */ for ( ; ; ) { tmp = __hpet_calibrate(hpetp); if (ret <= tmp) break; ret = tmp; } return ret; } int hpet_alloc(struct hpet_data *hdp) { u64 cap, mcfg; struct hpet_dev *devp; u32 i, ntimer; struct hpets *hpetp; struct hpet __iomem *hpet; static struct hpets *last; u32 period; unsigned long long temp; u32 remainder; /* * hpet_alloc can be called by platform dependent code. * If platform dependent code has allocated the hpet that * ACPI has also reported, then we catch it here. */ if (hpet_is_known(hdp)) { printk(KERN_DEBUG "%s: duplicate HPET ignored\n", __func__); return 0; } hpetp = kzalloc(struct_size(hpetp, hp_dev, hdp->hd_nirqs), GFP_KERNEL); if (!hpetp) return -ENOMEM; hpetp->hp_which = hpet_nhpet++; hpetp->hp_hpet = hdp->hd_address; hpetp->hp_hpet_phys = hdp->hd_phys_address; hpetp->hp_ntimer = hdp->hd_nirqs; for (i = 0; i < hdp->hd_nirqs; i++) hpetp->hp_dev[i].hd_hdwirq = hdp->hd_irq[i]; hpet = hpetp->hp_hpet; cap = readq(&hpet->hpet_cap); ntimer = ((cap & HPET_NUM_TIM_CAP_MASK) >> HPET_NUM_TIM_CAP_SHIFT) + 1; if (hpetp->hp_ntimer != ntimer) { printk(KERN_WARNING "hpet: number irqs doesn't agree" " with number of timers\n"); kfree(hpetp); return -ENODEV; } if (last) last->hp_next = hpetp; else hpets = hpetp; last = hpetp; period = (cap & HPET_COUNTER_CLK_PERIOD_MASK) >> HPET_COUNTER_CLK_PERIOD_SHIFT; /* fs, 10^-15 */ temp = 1000000000000000uLL; /* 10^15 femtoseconds per second */ temp += period >> 1; /* round */ do_div(temp, period); hpetp->hp_tick_freq = temp; /* ticks per second */ printk(KERN_INFO "hpet%u: at MMIO 0x%lx, IRQ%s", hpetp->hp_which, hdp->hd_phys_address, str_plural(hpetp->hp_ntimer)); for (i = 0; i < hpetp->hp_ntimer; i++) printk(KERN_CONT "%s %u", i > 0 ? "," : "", hdp->hd_irq[i]); printk(KERN_CONT "\n"); temp = hpetp->hp_tick_freq; remainder = do_div(temp, 1000000); printk(KERN_INFO "hpet%u: %u comparators, %d-bit %u.%06u MHz counter\n", hpetp->hp_which, hpetp->hp_ntimer, cap & HPET_COUNTER_SIZE_MASK ? 64 : 32, (unsigned) temp, remainder); mcfg = readq(&hpet->hpet_config); if ((mcfg & HPET_ENABLE_CNF_MASK) == 0) { write_counter(0L, &hpet->hpet_mc); mcfg |= HPET_ENABLE_CNF_MASK; writeq(mcfg, &hpet->hpet_config); } for (i = 0, devp = hpetp->hp_dev; i < hpetp->hp_ntimer; i++, devp++) { struct hpet_timer __iomem *timer; timer = &hpet->hpet_timers[devp - hpetp->hp_dev]; devp->hd_hpets = hpetp; devp->hd_hpet = hpet; devp->hd_timer = timer; /* * If the timer was reserved by platform code, * then make timer unavailable for opens. */ if (hdp->hd_state & (1 << i)) { devp->hd_flags = HPET_OPEN; continue; } init_waitqueue_head(&devp->hd_waitqueue); } hpetp->hp_delta = hpet_calibrate(hpetp); return 0; } static acpi_status hpet_resources(struct acpi_resource *res, void *data) { struct hpet_data *hdp; acpi_status status; struct acpi_resource_address64 addr; hdp = data; status = acpi_resource_to_address64(res, &addr); if (ACPI_SUCCESS(status)) { hdp->hd_phys_address = addr.address.minimum; hdp->hd_address = ioremap(addr.address.minimum, addr.address.address_length); if (!hdp->hd_address) return AE_ERROR; if (hpet_is_known(hdp)) { iounmap(hdp->hd_address); return AE_ALREADY_EXISTS; } } else if (res->type == ACPI_RESOURCE_TYPE_FIXED_MEMORY32) { struct acpi_resource_fixed_memory32 *fixmem32; fixmem32 = &res->data.fixed_memory32; hdp->hd_phys_address = fixmem32->address; hdp->hd_address = ioremap(fixmem32->address, HPET_RANGE_SIZE); if (!hdp->hd_address) return AE_ERROR; if (hpet_is_known(hdp)) { iounmap(hdp->hd_address); return AE_ALREADY_EXISTS; } } else if (res->type == ACPI_RESOURCE_TYPE_EXTENDED_IRQ) { struct acpi_resource_extended_irq *irqp; int i, irq; irqp = &res->data.extended_irq; for (i = 0; i < irqp->interrupt_count; i++) { if (hdp->hd_nirqs >= HPET_MAX_TIMERS) break; irq = acpi_register_gsi(NULL, irqp->interrupts[i], irqp->triggering, irqp->polarity); if (irq < 0) return AE_ERROR; hdp->hd_irq[hdp->hd_nirqs] = irq; hdp->hd_nirqs++; } } return AE_OK; } static int hpet_acpi_add(struct acpi_device *device) { acpi_status result; struct hpet_data data; memset(&data, 0, sizeof(data)); result = acpi_walk_resources(device->handle, METHOD_NAME__CRS, hpet_resources, &data); if (ACPI_FAILURE(result)) return -ENODEV; if (!data.hd_address || !data.hd_nirqs) { if (data.hd_address) iounmap(data.hd_address); printk("%s: no address or irqs in _CRS\n", __func__); return -ENODEV; } return hpet_alloc(&data); } static const struct acpi_device_id hpet_device_ids[] = { {"PNP0103", 0}, {"", 0}, }; static struct acpi_driver hpet_acpi_driver = { .name = "hpet", .ids = hpet_device_ids, .ops = { .add = hpet_acpi_add, }, }; static struct miscdevice hpet_misc = { HPET_MINOR, "hpet", &hpet_fops }; static int __init hpet_init(void) { int result; result = misc_register(&hpet_misc); if (result < 0) return -ENODEV; sysctl_header = register_sysctl("dev/hpet", hpet_table); result = acpi_bus_register_driver(&hpet_acpi_driver); if (result < 0) { unregister_sysctl_table(sysctl_header); misc_deregister(&hpet_misc); return result; } return 0; } device_initcall(hpet_init); /* MODULE_AUTHOR("Bob Picco <Robert.Picco@hp.com>"); MODULE_LICENSE("GPL"); */
43 42 41 41 2 4 42 43 43 41 1 1 43 2 10 18 13 15 41 41 2 2 2 2 39 40 43 1 2 6 3 3 6 6 6 5 4 5 1 3 6 2 2 1 1 5 5 5 5 3 5 9 9 9 9 1 2 4 9 3 1 2 2 2 2 4 4 4 4 4 4 5 27 27 26 21 21 27 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ipv6.h> #include <net/inet6_hashtables.h> #include <net/addrconf.h> #include "rds.h" #include "loop.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) #define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1) /* converting this to RCU is a chore for another day.. */ static DEFINE_SPINLOCK(rds_conn_lock); static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, const struct in6_addr *faddr) { static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; __be32 lhash, fhash; u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } #define rds_conn_info_set(var, test, suffix) do { \ if (test) \ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; break; } } rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, laddr, faddr); return ret; } /* * This is called by transports as they're bringing down a connection. * It clears partial message state so that the transport can start sending * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ static void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; rdsdebug("connection %pI6c to %pI6c reset\n", &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); cp->cp_flags = 0; /* Do not clear next_rx_seq here, else we cannot distinguish * retransmitted packets from new packets, and will hand all * of them to the application. That is not consistent with the * reliability guarantees of RDS. */ } static void __rds_conn_path_init(struct rds_connection *conn, struct rds_conn_path *cp, bool is_outgoing) { spin_lock_init(&cp->cp_lock); cp->cp_next_tx_seq = 1; init_waitqueue_head(&cp->cp_waitq); INIT_LIST_HEAD(&cp->cp_send_queue); INIT_LIST_HEAD(&cp->cp_retrans); cp->cp_conn = conn; atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; cp->cp_reconnect_jiffies = 0; cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker); INIT_WORK(&cp->cp_down_w, rds_shutdown_worker); mutex_init(&cp->cp_cm_lock); cp->cp_flags = 0; } /* * There is only every one 'conn' for a given pair of addresses in the * system at a time. They contain messages to be retransmitted and so * span the lifetime of the actual underlying transport connections. * * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && ipv6_addr_equal(laddr, faddr) && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we * can stick the other QP. */ parent = conn; conn = parent->c_passive; } rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } conn->c_path = kcalloc(npaths, sizeof(struct rds_conn_path), gfp); if (!conn->c_path) { kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-ENOMEM); goto out; } INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = *laddr; conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the * index used for this connection. Otherwise, set it to 0 as * the socket is not bound to an interface. c_bound_if is used * to look up a socket when a packet is received */ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) conn->c_bound_if = dev_if; else #endif conn->c_bound_if = 0; rds_conn_net_set(conn, net); ret = rds_cong_get_maps(conn); if (ret) { kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } /* * This is where a connection becomes loopback. If *any* RDS sockets * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; if (trans->t_prefer_loopback) { if (likely(is_outgoing)) { /* "outgoing" connection to local address. * Protocol says it wants the connection * handled by the loopback transport. * This is what TCP does. */ trans = &rds_loop_transport; } else { /* No transport currently in use * should end up here, but if it * does, reset/destroy the connection. */ kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(-EOPNOTSUPP); goto out; } } } conn->c_trans = trans; init_waitqueue_head(&conn->c_hs_waitq); for (i = 0; i < npaths; i++) { __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; } rcu_read_lock(); if (rds_destroy_pending(conn)) ret = -ENETDOWN; else ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; } rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", conn, laddr, faddr, strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could * have created the same conn (either normal or passive) in the * interim. We check while holding the lock. If we won, we complete * init and return our conn. If we lost, we rollback and return the * other one. */ spin_lock_irqsave(&rds_conn_lock, flags); if (parent) { /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { parent->c_passive = conn; rds_cong_add_conn(conn); rds_conn_count++; } } else { /* Creating normal conn */ struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (found) { struct rds_conn_path *cp; int i; for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; /* The ->conn_alloc invocation may have * allocated resource for all paths, so all * of them may have to be freed here. */ if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); conn = found; } else { conn->c_my_gen_num = rds_gen_num; conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } } spin_unlock_irqrestore(&rds_conn_lock, flags); rcu_read_unlock(); out: return conn; } struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if) { return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); void rds_conn_shutdown(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; /* shut it down unless it's down already */ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { /* * Quiesce the connection mgmt handlers before we start tearing * things down. We don't hold the mutex for the entire * duration of the shutdown operation, else we may be * deadlocking with the CM handler. Instead, the CM event * handler is supposed to check for state DISCONNECTING */ mutex_lock(&cp->cp_cm_lock); if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", atomic_read(&cp->cp_state)); mutex_unlock(&cp->cp_cm_lock); return; } mutex_unlock(&cp->cp_cm_lock); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); wait_event(cp->cp_waitq, !test_bit(RDS_RECV_REFILL, &cp->cp_flags)); conn->c_trans->conn_path_shutdown(cp); rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. * Quite reproducible with loopback connections. * Mostly harmless. * * Note that this also happens with rds-tcp because * we could have triggered rds_conn_path_drop in irq * mode from rds_tcp_state change on the receipt of * a FIN, thus we need to recheck for RDS_CONN_ERROR * here. */ rds_conn_path_error(cp, "%s: failed to transition " "to state DOWN, current state " "is %d\n", __func__, atomic_read(&cp->cp_state)); return; } } /* Then reconnect if it's still live. * The passive side of an IB loopback connection is never added * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } } /* destroy a single rds_conn_path. rds_conn_destroy() iterates over * all paths using rds_conn_path_destroy() */ static void rds_conn_path_destroy(struct rds_conn_path *cp) { struct rds_message *rm, *rtmp; if (!cp->cp_transport_data) return; /* make sure lingering queued work won't try to ref the conn */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); rds_conn_path_drop(cp, true); flush_work(&cp->cp_down_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &cp->cp_send_queue, m_conn_item) { list_del_init(&rm->m_conn_item); BUG_ON(!list_empty(&rm->m_sock_item)); rds_message_put(rm); } if (cp->cp_xmit_rm) rds_message_put(cp->cp_xmit_rm); WARN_ON(delayed_work_pending(&cp->cp_send_w)); WARN_ON(delayed_work_pending(&cp->cp_recv_w)); WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } /* * Stop and free a connection. * * This can only be used in very limited circumstances. It assumes that once * the conn has been shutdown that no one else is referencing the connection. * We can only ensure this in the rmmod path in the current code. */ void rds_conn_destroy(struct rds_connection *conn) { unsigned long flags; int i; struct rds_conn_path *cp; int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); synchronize_rcu(); /* shut the connection down */ for (i = 0; i < npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_destroy(cp); BUG_ON(!list_empty(&cp->cp_retrans)); } /* * The congestion maps aren't freed up here. They're * freed by rds_cong_exit() after all the connections * have been freed. */ rds_cong_remove_conn(conn); kfree(conn->c_path); kmem_cache_free(rds_conn_slab, conn); spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); static void __rds_inc_msg_cp(struct rds_incoming *inc, struct rds_info_iterator *iter, void *saddr, void *daddr, int flip, bool isv6) { #if IS_ENABLED(CONFIG_IPV6) if (isv6) rds6_inc_info_copy(inc, iter, saddr, daddr, flip); else #endif rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip); } static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; struct rds_connection *conn; struct rds_message *rm; unsigned int total = 0; unsigned long flags; size_t i; int j; if (isv6) len /= sizeof(struct rds6_info_message); else len /= sizeof(struct rds_info_message); rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; int npaths; if (!isv6 && conn->c_isv6) continue; npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); for (j = 0; j < npaths; j++) { cp = &conn->c_path[j]; if (want_send) list = &cp->cp_send_queue; else list = &cp->cp_retrans; spin_lock_irqsave(&cp->cp_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { total++; if (total <= len) __rds_inc_msg_cp(&rm->m_inc, iter, &conn->c_laddr, &conn->c_faddr, 0, isv6); } spin_unlock_irqrestore(&cp->cp_lock, flags); } } } rcu_read_unlock(); lens->nr = total; if (isv6) lens->each = sizeof(struct rds6_info_message); else lens->each = sizeof(struct rds_info_message); } static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int want_send) { rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } #endif static void rds_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 1); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 1); } #endif static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds_conn_message_info(sock, len, iter, lens, 0); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { rds6_conn_message_info(sock, len, iter, lens, 0); } #endif void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_conn_path *, void *), u64 *buffer, size_t item_len) { struct hlist_head *head; struct rds_connection *conn; size_t i; rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; /* XXX We only copy the information from the first * path for now. The problem is that if there are * more than one underlying paths, we cannot report * information of all of them using the existing * API. For example, there is only one next_tx_seq, * which path's next_tx_seq should we report? It is * a bug in the design of MPRDS. */ cp = conn->c_path; /* XXX no cp_lock usage.. */ if (!visitor(cp, buffer)) continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller * can resize the buffer. */ if (len >= item_len) { rds_info_copy(iter, buffer, item_len); len -= item_len; } lens->nr++; } } rcu_read_unlock(); } static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; if (conn->c_isv6) return 0; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; strscpy_pad(cinfo->transport, conn->c_trans->t_name); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); return 1; } #if IS_ENABLED(CONFIG_IPV6) static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds6_info_connection *cinfo6 = buffer; struct rds_connection *conn = cp->cp_conn; cinfo6->next_tx_seq = cp->cp_next_tx_seq; cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; strscpy_pad(cinfo6->transport, conn->c_trans->t_name); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, CONNECTING); rds_conn_info_set(cinfo6->flags, atomic_read(&cp->cp_state) == RDS_CONN_UP, CONNECTED); /* Just return 1 as there is no error case. This is a helper function * for rds_walk_conn_path_info() and it wants a return value. */ return 1; } #endif static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, buffer, sizeof(struct rds_info_connection)); } #if IS_ENABLED(CONFIG_IPV6) static void rds6_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8]; rds_walk_conn_path_info(sock, len, iter, lens, rds6_conn_info_visitor, buffer, sizeof(struct rds6_info_connection)); } #endif int rds_conn_init(void) { int ret; ret = rds_loop_net_init(); /* register pernet callback */ if (ret) return ret; rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_register_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif return 0; } void rds_conn_exit(void) { rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); kmem_cache_destroy(rds_conn_slab); rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_deregister_func(RDS_INFO_SEND_MESSAGES, rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); #if IS_ENABLED(CONFIG_IPV6) rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, rds6_conn_message_info_send); rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, rds6_conn_message_info_retrans); #endif } /* * Force a disconnect */ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } queue_work(rds_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); void rds_conn_drop(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_drop(&conn->c_path[0], false); } EXPORT_SYMBOL_GPL(rds_conn_drop); /* * If the connection is down, trigger a connect. We may have scheduled a * delayed reconnect however - in this case we should not interfere. */ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) { rcu_read_lock(); if (rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); /* Check connectivity of all paths */ void rds_check_all_paths(struct rds_connection *conn) { int i = 0; do { rds_conn_path_connect_if_down(&conn->c_path[i]); } while (++i < conn->c_npaths); } void rds_conn_connect_if_down(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); rds_conn_path_connect_if_down(&conn->c_path[0]); } EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); void __rds_conn_path_error(struct rds_conn_path *cp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vprintk(fmt, ap); va_end(ap); rds_conn_path_drop(cp, false); }
11 1 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * Lauro Ramos Venancio <lauro.venancio@openbossa.org> */ #include <linux/nfc.h> #include <linux/module.h> #include "nfc.h" static DEFINE_RWLOCK(proto_tab_lock); static const struct nfc_protocol *proto_tab[NFC_SOCKPROTO_MAX]; static int nfc_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int rc = -EPROTONOSUPPORT; if (net != &init_net) return -EAFNOSUPPORT; if (proto < 0 || proto >= NFC_SOCKPROTO_MAX) return -EINVAL; read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); return rc; } static const struct net_proto_family nfc_sock_family_ops = { .owner = THIS_MODULE, .family = PF_NFC, .create = nfc_sock_create, }; int nfc_proto_register(const struct nfc_protocol *nfc_proto) { int rc; if (nfc_proto->id < 0 || nfc_proto->id >= NFC_SOCKPROTO_MAX) return -EINVAL; rc = proto_register(nfc_proto->proto, 0); if (rc) return rc; write_lock(&proto_tab_lock); if (proto_tab[nfc_proto->id]) rc = -EBUSY; else proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); if (rc) proto_unregister(nfc_proto->proto); return rc; } EXPORT_SYMBOL(nfc_proto_register); void nfc_proto_unregister(const struct nfc_protocol *nfc_proto) { write_lock(&proto_tab_lock); proto_tab[nfc_proto->id] = NULL; write_unlock(&proto_tab_lock); proto_unregister(nfc_proto->proto); } EXPORT_SYMBOL(nfc_proto_unregister); int __init af_nfc_init(void) { return sock_register(&nfc_sock_family_ops); } void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); }
27 25 57 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ #define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* * Try to dedicate one of the protection keys to be used as an * execute-only protection key. */ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey); static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); } #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; /* * The exec-only pkey is set in the allocation map, but * is not available to any of the user interfaces like * mprotect_pkey(). */ if (pkey == mm->context.execute_only_pkey) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 4-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); int ret; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } #endif /*_ASM_X86_PKEYS_H */
26 49556 26 49556 18836 49556 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_IRQFLAGS_H_ #define _X86_IRQFLAGS_H_ #include <asm/processor-flags.h> #ifndef __ASSEMBLER__ #include <asm/nospec-branch.h> /* * Interrupt control: */ /* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */ extern inline unsigned long native_save_fl(void); extern __always_inline unsigned long native_save_fl(void) { unsigned long flags; /* * "=rm" is safe here, because "pop" adjusts the stack before * it evaluates its effective address -- this is part of the * documented behavior of the "pop" instruction. */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" : "=rm" (flags) : /* no input */ : "memory"); return flags; } static __always_inline void native_irq_disable(void) { asm volatile("cli": : :"memory"); } static __always_inline void native_irq_enable(void) { asm volatile("sti": : :"memory"); } static __always_inline void native_safe_halt(void) { x86_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } static __always_inline void native_halt(void) { x86_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } static __always_inline int native_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline unsigned long native_local_irq_save(void) { unsigned long flags = native_save_fl(); native_irq_disable(); return flags; } static __always_inline void native_local_irq_restore(unsigned long flags) { if (!native_irqs_disabled_flags(flags)) native_irq_enable(); } #endif #ifndef CONFIG_PARAVIRT #ifndef __ASSEMBLY__ /* * Used in the idle loop; sti takes one instruction cycle * to complete: */ static __always_inline void arch_safe_halt(void) { native_safe_halt(); } /* * Used when interrupts are already enabled or to * shutdown the processor: */ static __always_inline void halt(void) { native_halt(); } #endif /* __ASSEMBLY__ */ #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #ifndef __ASSEMBLER__ #include <linux/types.h> static __always_inline unsigned long arch_local_save_flags(void) { return native_save_fl(); } static __always_inline void arch_local_irq_disable(void) { native_irq_disable(); } static __always_inline void arch_local_irq_enable(void) { native_irq_enable(); } /* * For spinlocks, etc: */ static __always_inline unsigned long arch_local_irq_save(void) { unsigned long flags = arch_local_save_flags(); arch_local_irq_disable(); return flags; } #else #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS pushfq; popq %rax #endif #endif #endif /* __ASSEMBLER__ */ #endif /* CONFIG_PARAVIRT_XXL */ #ifndef __ASSEMBLER__ static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); } static __always_inline int arch_irqs_disabled(void) { unsigned long flags = arch_local_save_flags(); return arch_irqs_disabled_flags(flags); } static __always_inline void arch_local_irq_restore(unsigned long flags) { if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } #endif /* !__ASSEMBLER__ */ #endif
4 4 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 // SPDX-License-Identifier: GPL-2.0-only /* * The NFC Controller Interface is the communication protocol between an * NFC Controller (NFCC) and a Device Host (DH). * This is the HCI over NCI implementation, as specified in the 10.2 * section of the NCI 1.1 specification. * * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. */ #include <linux/skbuff.h> #include "../nfc.h" #include <net/nfc/nci.h> #include <net/nfc/nci_core.h> #include <linux/nfc.h> #include <linux/kcov.h> struct nci_data { u8 conn_id; u8 pipe; u8 cmd; const u8 *data; u32 data_len; } __packed; struct nci_hci_create_pipe_params { u8 src_gate; u8 dest_host; u8 dest_gate; } __packed; struct nci_hci_create_pipe_resp { u8 src_host; u8 src_gate; u8 dest_host; u8 dest_gate; u8 pipe; } __packed; struct nci_hci_delete_pipe_noti { u8 pipe; } __packed; struct nci_hci_all_pipe_cleared_noti { u8 host; } __packed; struct nci_hcp_message { u8 header; /* type -cmd,evt,rsp- + instruction */ u8 data[]; } __packed; struct nci_hcp_packet { u8 header; /* cbit+pipe */ struct nci_hcp_message message; } __packed; #define NCI_HCI_ANY_SET_PARAMETER 0x01 #define NCI_HCI_ANY_GET_PARAMETER 0x02 #define NCI_HCI_ANY_CLOSE_PIPE 0x04 #define NCI_HCI_ADM_CLEAR_ALL_PIPE 0x14 #define NCI_HFP_NO_CHAINING 0x80 #define NCI_NFCEE_ID_HCI 0x80 #define NCI_EVT_HOT_PLUG 0x03 #define NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY 0x01 #define NCI_HCI_ADM_CREATE_PIPE 0x10 #define NCI_HCI_ADM_DELETE_PIPE 0x11 /* HCP headers */ #define NCI_HCI_HCP_PACKET_HEADER_LEN 1 #define NCI_HCI_HCP_MESSAGE_HEADER_LEN 1 #define NCI_HCI_HCP_HEADER_LEN 2 /* HCP types */ #define NCI_HCI_HCP_COMMAND 0x00 #define NCI_HCI_HCP_EVENT 0x01 #define NCI_HCI_HCP_RESPONSE 0x02 #define NCI_HCI_ADM_NOTIFY_PIPE_CREATED 0x12 #define NCI_HCI_ADM_NOTIFY_PIPE_DELETED 0x13 #define NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED 0x15 #define NCI_HCI_FRAGMENT 0x7f #define NCI_HCP_HEADER(type, instr) ((((type) & 0x03) << 6) |\ ((instr) & 0x3f)) #define NCI_HCP_MSG_GET_TYPE(header) ((header & 0xc0) >> 6) #define NCI_HCP_MSG_GET_CMD(header) (header & 0x3f) #define NCI_HCP_MSG_GET_PIPE(header) (header & 0x7f) static int nci_hci_result_to_errno(u8 result) { switch (result) { case NCI_HCI_ANY_OK: return 0; case NCI_HCI_ANY_E_REG_PAR_UNKNOWN: return -EOPNOTSUPP; case NCI_HCI_ANY_E_TIMEOUT: return -ETIME; default: return -1; } } /* HCI core */ static void nci_hci_reset_pipes(struct nci_hci_dev *hdev) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { hdev->pipes[i].gate = NCI_HCI_INVALID_GATE; hdev->pipes[i].host = NCI_HCI_INVALID_HOST; } memset(hdev->gate2pipe, NCI_HCI_INVALID_PIPE, sizeof(hdev->gate2pipe)); } static void nci_hci_reset_pipes_per_host(struct nci_dev *ndev, u8 host) { int i; for (i = 0; i < NCI_HCI_MAX_PIPES; i++) { if (ndev->hci_dev->pipes[i].host == host) { ndev->hci_dev->pipes[i].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[i].host = NCI_HCI_INVALID_HOST; } } } /* Fragment HCI data over NCI packet. * NFC Forum NCI 10.2.2 Data Exchange: * The payload of the Data Packets sent on the Logical Connection SHALL be * valid HCP packets, as defined within [ETSI_102622]. Each Data Packet SHALL * contain a single HCP packet. NCI Segmentation and Reassembly SHALL NOT be * applied to Data Messages in either direction. The HCI fragmentation mechanism * is used if required. */ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; i = 0; skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 2); *(u8 *)skb_push(skb, 1) = data_type; do { /* If last packet add NCI_HFP_NO_CHAINING */ if (i + conn_info->max_pkt_payload_len - (skb->len + 1) >= data_len) { cb |= NCI_HFP_NO_CHAINING; len = data_len - i; } else { len = conn_info->max_pkt_payload_len - skb->len - 1; } *(u8 *)skb_push(skb, 1) = cb; if (len > 0) skb_put_data(skb, data + i, len); r = nci_send_data(ndev, conn_info->conn_id, skb); if (r < 0) return r; i += len; if (i < data_len) { skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; skb_reserve(skb, NCI_DATA_HDR_SIZE + 1); } } while (i < data_len); return i; } static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt) { const struct nci_data *data = opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); } int nci_hci_send_event(struct nci_dev *ndev, u8 gate, u8 event, const u8 *param, size_t param_len) { u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; return nci_hci_send_data(ndev, pipe, NCI_HCP_HEADER(NCI_HCI_HCP_EVENT, event), param, param_len); } EXPORT_SYMBOL(nci_hci_send_event); int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, cmd); data.data = param; data.data_len = param_len; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_send_cmd); int nci_hci_clear_all_pipes(struct nci_dev *ndev) { int r; r = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CLEAR_ALL_PIPE, NULL, 0, NULL); if (r < 0) return r; nci_hci_reset_pipes(ndev->hci_dev); return r; } EXPORT_SYMBOL(nci_hci_clear_all_pipes); static void nci_hci_event_received(struct nci_dev *ndev, u8 pipe, u8 event, struct sk_buff *skb) { if (ndev->ops->hci_event_received) ndev->ops->hci_event_received(ndev, pipe, event, skb); } static void nci_hci_cmd_received(struct nci_dev *ndev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 gate = ndev->hci_dev->pipes[pipe].gate; u8 status = NCI_HCI_ANY_OK | ~NCI_HCI_FRAGMENT; u8 dest_gate, new_pipe; struct nci_hci_create_pipe_resp *create_info; struct nci_hci_delete_pipe_noti *delete_info; struct nci_hci_all_pipe_cleared_noti *cleared_info; pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd); switch (cmd) { case NCI_HCI_ADM_NOTIFY_PIPE_CREATED: if (skb->len != 5) { status = NCI_HCI_ANY_E_NOK; goto exit; } create_info = (struct nci_hci_create_pipe_resp *)skb->data; dest_gate = create_info->dest_gate; new_pipe = create_info->pipe; if (new_pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we * are the destination and it is our local gate */ ndev->hci_dev->gate2pipe[dest_gate] = new_pipe; ndev->hci_dev->pipes[new_pipe].gate = dest_gate; ndev->hci_dev->pipes[new_pipe].host = create_info->src_host; break; case NCI_HCI_ANY_OPEN_PIPE: /* If the pipe is not created report an error */ if (gate == NCI_HCI_INVALID_GATE) { status = NCI_HCI_ANY_E_NOK; goto exit; } break; case NCI_HCI_ADM_NOTIFY_PIPE_DELETED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } delete_info = (struct nci_hci_delete_pipe_noti *)skb->data; if (delete_info->pipe >= NCI_HCI_MAX_PIPES) { status = NCI_HCI_ANY_E_NOK; goto exit; } ndev->hci_dev->pipes[delete_info->pipe].gate = NCI_HCI_INVALID_GATE; ndev->hci_dev->pipes[delete_info->pipe].host = NCI_HCI_INVALID_HOST; break; case NCI_HCI_ADM_NOTIFY_ALL_PIPE_CLEARED: if (skb->len != 1) { status = NCI_HCI_ANY_E_NOK; goto exit; } cleared_info = (struct nci_hci_all_pipe_cleared_noti *)skb->data; nci_hci_reset_pipes_per_host(ndev, cleared_info->host); break; default: pr_debug("Discarded unknown cmd %x to gate %x\n", cmd, gate); break; } if (ndev->ops->hci_cmd_received) ndev->ops->hci_cmd_received(ndev, pipe, cmd, skb); exit: nci_hci_send_data(ndev, pipe, status, NULL, 0); kfree_skb(skb); } static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) goto exit; conn_info->rx_skb = skb; exit: nci_req_complete(ndev, NCI_STATUS_OK); } /* Receive hcp message for pipe, with type and cmd. * skb contains optional message data only. */ static void nci_hci_hcp_message_rx(struct nci_dev *ndev, u8 pipe, u8 type, u8 instruction, struct sk_buff *skb) { switch (type) { case NCI_HCI_HCP_RESPONSE: nci_hci_resp_received(ndev, pipe, skb); break; case NCI_HCI_HCP_COMMAND: nci_hci_cmd_received(ndev, pipe, instruction, skb); break; case NCI_HCI_HCP_EVENT: nci_hci_event_received(ndev, pipe, instruction, skb); break; default: pr_err("UNKNOWN MSG Type %d, instruction=%d\n", type, instruction); kfree_skb(skb); break; } nci_req_complete(ndev, NCI_STATUS_OK); } static void nci_hci_msg_rx_work(struct work_struct *work) { struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; const struct nci_hcp_message *message; u8 pipe, type, instruction; for (; (skb = skb_dequeue(&hdev->msg_rx_queue)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]); skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN); message = (struct nci_hcp_message *)skb->data; type = NCI_HCP_MSG_GET_TYPE(message->header); instruction = NCI_HCP_MSG_GET_CMD(message->header); skb_pull(skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); nci_hci_hcp_message_rx(hdev->ndev, pipe, type, instruction, skb); } } void nci_hci_data_received_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; struct nci_hcp_packet *packet; u8 pipe, type; struct sk_buff *hcp_skb; struct sk_buff *frag_skb; int msg_len; if (err) { nci_req_complete(ndev, err); return; } packet = (struct nci_hcp_packet *)skb->data; if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) { skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); return; } /* it's the last fragment. Does it need re-aggregation? */ if (skb_queue_len(&ndev->hci_dev->rx_hcp_frags)) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); msg_len = 0; skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len += (frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN); } hcp_skb = nfc_alloc_recv_skb(NCI_HCI_HCP_PACKET_HEADER_LEN + msg_len, GFP_KERNEL); if (!hcp_skb) { nci_req_complete(ndev, -ENOMEM); return; } skb_put_u8(hcp_skb, pipe); skb_queue_walk(&ndev->hci_dev->rx_hcp_frags, frag_skb) { msg_len = frag_skb->len - NCI_HCI_HCP_PACKET_HEADER_LEN; skb_put_data(hcp_skb, frag_skb->data + NCI_HCI_HCP_PACKET_HEADER_LEN, msg_len); } skb_queue_purge(&ndev->hci_dev->rx_hcp_frags); } else { packet->header &= NCI_HCI_FRAGMENT; hcp_skb = skb; } /* if this is a response, dispatch immediately to * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { pipe = NCI_HCP_MSG_GET_PIPE(packet->header); skb_pull(hcp_skb, NCI_HCI_HCP_PACKET_HEADER_LEN); nci_hci_hcp_message_rx(ndev, pipe, type, NCI_STATUS_OK, hcp_skb); } else { skb_queue_tail(&ndev->hci_dev->msg_rx_queue, hcp_skb); schedule_work(&ndev->hci_dev->msg_rx_work); } } int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_OPEN_PIPE); data.data = NULL; data.data_len = 0; return nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); } EXPORT_SYMBOL(nci_hci_open_pipe); static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, int *result) { u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); params.src_gate = NCI_HCI_ADMIN_GATE; params.dest_host = dest_host; params.dest_gate = dest_gate; *result = nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_CREATE_PIPE, (u8 *)&params, sizeof(params), &skb); if (*result < 0) return NCI_HCI_INVALID_PIPE; resp = (struct nci_hci_create_pipe_resp *)skb->data; pipe = resp->pipe; kfree_skb(skb); pr_debug("pipe created=%d\n", pipe); if (pipe >= NCI_HCI_MAX_PIPES) pipe = NCI_HCI_INVALID_PIPE; return pipe; } static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) { return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; tmp = kmalloc(1 + param_len, GFP_KERNEL); if (!tmp) return -ENOMEM; *tmp = idx; memcpy(tmp + 1, param, param_len); data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_SET_PARAMETER); data.data = tmp; data.data_len = param_len + 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); } kfree(tmp); return r; } EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { const struct nci_hcp_message *message; const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; pr_debug("idx=%d to gate %d\n", idx, gate); if (pipe == NCI_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; data.conn_id = conn_info->conn_id; data.pipe = pipe; data.cmd = NCI_HCP_HEADER(NCI_HCI_HCP_COMMAND, NCI_HCI_ANY_GET_PARAMETER); data.data = &idx; data.data_len = 1; r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; r = nci_hci_result_to_errno( NCI_HCP_MSG_GET_CMD(message->header)); skb_pull(conn_info->rx_skb, NCI_HCI_HCP_MESSAGE_HEADER_LEN); if (!r && skb) *skb = conn_info->rx_skb; } return r; } EXPORT_SYMBOL(nci_hci_get_param); int nci_hci_connect_gate(struct nci_dev *ndev, u8 dest_host, u8 dest_gate, u8 pipe) { bool pipe_created = false; int r; if (pipe == NCI_HCI_DO_NOT_OPEN_PIPE) return 0; if (ndev->hci_dev->gate2pipe[dest_gate] != NCI_HCI_INVALID_PIPE) return -EADDRINUSE; if (pipe != NCI_HCI_INVALID_PIPE) goto open_pipe; switch (dest_gate) { case NCI_HCI_LINK_MGMT_GATE: pipe = NCI_HCI_LINK_MGMT_PIPE; break; case NCI_HCI_ADMIN_GATE: pipe = NCI_HCI_ADMIN_PIPE; break; default: pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r); if (pipe == NCI_HCI_INVALID_PIPE) return r; pipe_created = true; break; } open_pipe: r = nci_hci_open_pipe(ndev, pipe); if (r < 0) { if (pipe_created) { if (nci_hci_delete_pipe(ndev, pipe) < 0) { /* TODO: Cannot clean by deleting pipe... * -> inconsistent state */ } } return r; } ndev->hci_dev->pipes[pipe].gate = dest_gate; ndev->hci_dev->pipes[pipe].host = dest_host; ndev->hci_dev->gate2pipe[dest_gate] = pipe; return 0; } EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, const struct nci_hci_gate *gates) { int r; while (gate_count--) { r = nci_hci_connect_gate(ndev, gates->dest_host, gates->gate, gates->pipe); if (r < 0) return r; gates++; } return 0; } int nci_hci_dev_session_init(struct nci_dev *ndev) { struct nci_conn_info *conn_info; struct sk_buff *skb; int r; ndev->hci_dev->count_pipes = 0; ndev->hci_dev->expected_pipes = 0; conn_info = ndev->hci_dev->conn_info; if (!conn_info) return -EPROTO; conn_info->data_exchange_cb = nci_hci_data_received_cb; conn_info->data_exchange_cb_context = ndev; nci_hci_reset_pipes(ndev->hci_dev); if (ndev->hci_dev->init_data.gates[0].gate != NCI_HCI_ADMIN_GATE) return -EPROTO; r = nci_hci_connect_gate(ndev, ndev->hci_dev->init_data.gates[0].dest_host, ndev->hci_dev->init_data.gates[0].gate, ndev->hci_dev->init_data.gates[0].pipe); if (r < 0) return r; r = nci_hci_get_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, &skb); if (r < 0) return r; if (skb->len && skb->len == strlen(ndev->hci_dev->init_data.session_id) && !memcmp(ndev->hci_dev->init_data.session_id, skb->data, skb->len) && ndev->ops->hci_load_session) { /* Restore gate<->pipe table from some proprietary location. */ r = ndev->ops->hci_load_session(ndev); } else { r = nci_hci_clear_all_pipes(ndev); if (r < 0) goto exit; r = nci_hci_dev_connect_gates(ndev, ndev->hci_dev->init_data.gate_count, ndev->hci_dev->init_data.gates); if (r < 0) goto exit; r = nci_hci_set_param(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADMIN_PARAM_SESSION_IDENTITY, ndev->hci_dev->init_data.session_id, strlen(ndev->hci_dev->init_data.session_id)); } exit: kfree_skb(skb); return r; } EXPORT_SYMBOL(nci_hci_dev_session_init); struct nci_hci_dev *nci_hci_allocate(struct nci_dev *ndev) { struct nci_hci_dev *hdev; hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (!hdev) return NULL; skb_queue_head_init(&hdev->rx_hcp_frags); INIT_WORK(&hdev->msg_rx_work, nci_hci_msg_rx_work); skb_queue_head_init(&hdev->msg_rx_queue); hdev->ndev = ndev; return hdev; } void nci_hci_deallocate(struct nci_dev *ndev) { kfree(ndev->hci_dev); }
1 2 25 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RDS_RDS_H #define _RDS_RDS_H #include <net/sock.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <rdma/rdma_cm.h> #include <linux/mutex.h> #include <linux/rds.h> #include <linux/rhashtable.h> #include <linux/refcount.h> #include <linux/in6.h> #include "info.h" /* * RDS Network protocol version */ #define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_1 0x0301 #define RDS_PROTOCOL_4_0 0x0400 #define RDS_PROTOCOL_4_1 0x0401 #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) #define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1 /* The following ports, 16385, 18634, 18635, are registered with IANA as * the ports to be used for RDS over TCP and UDP. Currently, only RDS over * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept * to ensure compatibility with older RDS modules. Those ports are defined * in each transport's header file. */ #define RDS_PORT 18634 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ static inline __printf(1, 2) void rdsdebug(char *fmt, ...) { } #endif #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) /* Used to limit both RDMA and non-RDMA RDS message to 1MB */ #define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) struct rds_cong_map { struct rb_node m_rb_node; struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; }; /* * This is how we will track the connection state: * A connection is always in one of the following * states. Updates to the state are atomic and imply * a memory barrier. */ enum { RDS_CONN_DOWN = 0, RDS_CONN_CONNECTING, RDS_CONN_DISCONNECTING, RDS_CONN_UP, RDS_CONN_RESETTING, RDS_CONN_ERROR, }; /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 #define RDS_DESTROY_PENDING 4 /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 #define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) /* Per mpath connection state */ struct rds_conn_path { struct rds_connection *cp_conn; struct rds_message *cp_xmit_rm; unsigned long cp_xmit_sg; unsigned int cp_xmit_hdr_off; unsigned int cp_xmit_data_off; unsigned int cp_xmit_atomic_sent; unsigned int cp_xmit_rdma_sent; unsigned int cp_xmit_data_sent; spinlock_t cp_lock; /* protect msg queues */ u64 cp_next_tx_seq; struct list_head cp_send_queue; struct list_head cp_retrans; u64 cp_next_rx_seq; void *cp_transport_data; atomic_t cp_state; unsigned long cp_send_gen; unsigned long cp_flags; unsigned long cp_reconnect_jiffies; struct delayed_work cp_send_w; struct delayed_work cp_recv_w; struct delayed_work cp_conn_w; struct work_struct cp_down_w; struct mutex cp_cm_lock; /* protect cp_state & cm */ wait_queue_head_t cp_waitq; unsigned int cp_unacked_packets; unsigned int cp_unacked_bytes; unsigned int cp_index; }; /* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; struct in6_addr c_laddr; struct in6_addr c_faddr; int c_dev_if; /* ifindex used for this conn */ int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, c_isv6:1, c_ping_triggered:1, c_pad_to_32:29; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; /* Protocol version */ unsigned int c_proposed_version; unsigned int c_version; possible_net_t c_net; /* TOS */ u8 c_tos; struct list_head c_map_item; unsigned long c_map_queued; struct rds_conn_path *c_path; wait_queue_head_t c_hs_waitq; /* handshake waitq */ u32 c_my_gen_num; u32 c_peer_gen_num; }; static inline struct net *rds_conn_net(struct rds_connection *conn) { return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 #define RDS_MAX_ADV_CREDIT 255 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping * probe to exchange control information before establishing a connection. * Currently the control information that is exchanged is the number of * supported paths. If the peer is a legacy (older kernel revision) peer, * it would return a pong message without additional control information * that would then alert the sender that the peer was an older rev. */ #define RDS_FLAG_PROBE_PORT 1 #define RDS_HS_PROBE(sport, dport) \ ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) /* * Maximum space available for extension headers. */ #define RDS_HEADER_EXT_SPACE 16 struct rds_header { __be64 h_sequence; __be64 h_ack; __be32 h_len; __be16 h_sport; __be16 h_dport; u8 h_flags; u8 h_credit; u8 h_padding[4]; __sum16 h_csum; u8 h_exthdr[RDS_HEADER_EXT_SPACE]; }; /* * Reserved - indicates end of extensions */ #define RDS_EXTHDR_NONE 0 /* * This extension header is included in the very * first message that is sent on a new connection, * and identifies the protocol level. This will help * rolling updates if a future change requires breaking * the protocol. * NB: This is no longer true for IB, where we do a version * negotiation during the connection setup phase (protocol * version information is included in the RDMA CM private data). */ #define RDS_EXTHDR_VERSION 1 struct rds_ext_header_version { __be32 h_version; }; /* * This extension header is included in the RDS message * chasing an RDMA operation. */ #define RDS_EXTHDR_RDMA 2 struct rds_ext_header_rdma { __be32 h_rdma_rkey; }; /* * This extension header tells the peer about the * destination <R_Key,offset> of the requested RDMA * operation. */ #define RDS_EXTHDR_RDMA_DEST 3 struct rds_ext_header_rdma_dest { __be32 h_rdma_rkey; __be32 h_rdma_offset; }; /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ #define RDS_EXTHDR_NPATHS 5 #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) #define RDS_MSG_RX_HDR 0 #define RDS_MSG_RX_START 1 #define RDS_MSG_RX_END 2 #define RDS_MSG_RX_CMSG 3 /* The following values are whitelisted for usercopy */ struct rds_inc_usercopy { rds_rdma_cookie_t rdma_cookie; ktime_t rx_tstamp; }; struct rds_incoming { refcount_t i_refcount; struct list_head i_item; struct rds_connection *i_conn; struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; struct in6_addr i_saddr; struct rds_inc_usercopy i_usercopy; u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { struct rb_node r_rb_node; struct kref r_kref; u32 r_key; /* A copy of the creation flags */ unsigned int r_use_once:1; unsigned int r_invalidate:1; unsigned int r_write:1; struct rds_sock *r_sock; /* back pointer to the socket that owns us */ struct rds_transport *r_trans; void *r_trans_private; }; static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) { return r_key | (((u64) offset) << 32); } static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) { return cookie; } static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) { return cookie >> 32; } /* atomic operation types */ #define RDS_ATOMIC_TYPE_CSWP 0 #define RDS_ATOMIC_TYPE_FADD 1 /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty * the message will not be put back on the retransmit list after being sent. * messages that are canceled while being sent rely on this. * * m_inc is used by loopback so that it can pass an incoming message straight * back up into the rx path. It embeds a wire header which is also used by * the send path, which is kind of awkward. * * m_sock_item indicates the message's presence on a socket's send or receive * queue. m_rs will point to that socket. * * m_daddr is used by cancellation to prune messages to a given destination. * * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock * nesting. As paths iterate over messages on a sock, or conn, they must * also lock the conn, or sock, to remove the message from those lists too. * Testing the flag to determine if the message is still on the lists lets * us avoid testing the list_head directly. That means each path can use * the message's list_head to keep it on a local list while juggling locks * without confusing the other path. * * m_ack_seq is an optional field set by transports who need a different * sequence number range to invalidate. They can use this in a callback * that they pass to rds_send_drop_acked() to see if each message has been * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't * had ack_seq set yet. */ #define RDS_MSG_ON_SOCK 1 #define RDS_MSG_ON_CONN 2 #define RDS_MSG_HAS_ACK_SEQ 3 #define RDS_MSG_ACK_REQUIRED 4 #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 struct rds_znotifier { struct mmpin z_mmp; u32 z_cookie; }; struct rds_msg_zcopy_info { struct list_head rs_zcookie_next; union { struct rds_znotifier znotif; struct rds_zcopy_cookies zcookies; }; }; struct rds_msg_zcopy_queue { struct list_head zcookie_head; spinlock_t lock; /* protects zcookie_head queue */ }; static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) { spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->zcookie_head); } struct rds_iov_vector { struct rds_iovec *iov; int len; }; struct rds_iov_vector_arr { struct rds_iov_vector *vec; int len; int indx; int incr; }; struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. * Lock nesting is * rm->m_rs_lock * -> rs->rs_lock */ spinlock_t m_rs_lock; wait_queue_head_t m_flush_wait; struct rds_sock *m_rs; /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; unsigned int m_used_sgs; unsigned int m_total_sgs; void *m_final_op; struct { struct rm_atomic_op { int op_type; union { struct { uint64_t compare; uint64_t swap; uint64_t compare_mask; uint64_t swap_mask; } op_m_cswp; struct { uint64_t add; uint64_t nocarry_mask; } op_m_fadd; }; u32 op_rkey; u64 op_remote_addr; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; } atomic; struct rm_rdma_op { u32 op_rkey; u64 op_remote_addr; unsigned int op_write:1; unsigned int op_fence:1; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; unsigned int op_bytes; unsigned int op_nents; unsigned int op_count; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; u64 op_odp_addr; struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; struct rds_conn_path *m_conn_path; }; /* * The RDS notifier is used (optionally) to tell the application about * completed RDMA operations. Rather than keeping the whole rds message * around on the queue, we allocate a small notifier that is put on the * socket's notifier_list. Notifications are delivered to the application * through control messages. */ struct rds_notifier { struct list_head n_list; uint64_t n_user_token; int n_status; }; /* Available as part of RDS core, so doesn't need to participate * in get_preferred transport etc */ #define RDS_TRANS_LOOP 3 /** * struct rds_transport - transport specific behavioural hooks * * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send * part of a message. The caller serializes on the send_sem so this * doesn't need to be reentrant for a given conn. The header must be * sent before the data payload. .xmit must be prepared to send a * message with no data payload. .xmit should return the number of * bytes that were sent down the connection, including header bytes. * Returning 0 tells the caller that it doesn't need to perform any * additional work now. This is usually the case when the transport has * filled the sending queue for its connection and will handle * triggering the rds thread to continue the send when space becomes * available. Returning -EAGAIN tells the caller to retry the send * immediately. Returning -ENOMEM tells the caller to retry the send at * some point in the future. * * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once * it returns the connection can not call rds_recv_incoming(). * This will only be called once after conn_connect returns * non-zero success and will The caller serializes this with * the send and connecting paths (xmit_* and conn_*). The * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. */ struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; unsigned int t_prefer_loopback:1, t_mp_capable:1; unsigned int t_type; int (*laddr_check)(struct net *net, const struct in6_addr *addr, __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6); int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, unsigned int avail); void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); bool (*t_unloading)(struct rds_connection *conn); u8 (*get_tos_map)(u8 tos); }; /* Bind hash table key length. It is the sum of the size of a struct * in6_addr, a scope_id and a port. */ #define RDS_BOUND_KEY_LEN \ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) struct rds_sock { struct sock rs_sk; u64 rs_user_addr; u64 rs_user_bytes; /* * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ struct rhash_head rs_bound_node; u8 rs_bound_key[RDS_BOUND_KEY_LEN]; struct sockaddr_in6 rs_bound_sin6; #define rs_bound_addr rs_bound_sin6.sin6_addr #define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] #define rs_bound_port rs_bound_sin6.sin6_port #define rs_bound_scope_id rs_bound_sin6.sin6_scope_id struct in6_addr rs_conn_addr; #define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; /* * rds_sendmsg caches the conn it used the last time around. * This helps avoid costly lookups. */ struct rds_connection *rs_conn; /* flag indicating we were congested or not */ int rs_congested; /* seen congestion (ENOBUFS) when sending? */ int rs_seen_congestion; /* rs_lock protects all these adjacent members before the newline */ spinlock_t rs_lock; struct list_head rs_send_queue; u32 rs_snd_bytes; int rs_rcv_bytes; struct list_head rs_notify_queue; /* currently used for failed RDMAs */ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask * to decide whether the application should be woken up. * If not set, we use rs_cong_track to find out whether a cong map * update arrived. */ uint64_t rs_cong_mask; uint64_t rs_cong_notify; struct list_head rs_cong_list; unsigned long rs_cong_track; /* * rs_recv_lock protects the receive queue, and is * used to serialize with rds_release. */ rwlock_t rs_recv_lock; struct list_head rs_recv_queue; /* just for stats reporting */ struct list_head rs_item; /* these have their own lock */ spinlock_t rs_rdma_lock; struct rb_root rs_rdma_keys; /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; struct rds_msg_zcopy_queue rs_zcookie_queue; u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) { return container_of(sk, struct rds_sock, rs_sk); } static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) { return &rs->rs_sk; } /* * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value * to account for overhead. We don't account for overhead, we just apply * the number of payload bytes to the specified value. */ static inline int rds_sk_sndbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_sndbuf / 2; } static inline int rds_sk_rcvbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_rcvbuf / 2; } struct rds_statistics { uint64_t s_conn_reset; uint64_t s_recv_drop_bad_checksum; uint64_t s_recv_drop_old_seq; uint64_t s_recv_drop_no_sock; uint64_t s_recv_drop_dead_sock; uint64_t s_recv_deliver_raced; uint64_t s_recv_delivered; uint64_t s_recv_queued; uint64_t s_recv_immediate_retry; uint64_t s_recv_delayed_retry; uint64_t s_recv_ack_required; uint64_t s_recv_rdma_bytes; uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; uint64_t s_send_lock_contention; uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; uint64_t s_send_ack_required; uint64_t s_send_queued; uint64_t s_send_rdma; uint64_t s_send_rdma_bytes; uint64_t s_send_pong; uint64_t s_page_remainder_hit; uint64_t s_page_remainder_miss; uint64_t s_copy_to_user; uint64_t s_copy_from_user; uint64_t s_cong_update_queued; uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; uint64_t s_recv_bytes_added_to_socket; uint64_t s_recv_bytes_removed_from_socket; uint64_t s_send_stuck_rm; }; /* af_rds.c */ void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); static inline void __rds_wake_sk_sleep(struct sock *sk) { wait_queue_head_t *waitq = sk_sleep(sk); if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); } extern wait_queue_head_t rds_poll_waitq; /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); int rds_bind_lock_init(void); void rds_bind_lock_destroy(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); void rds_cong_add_conn(struct rds_connection *conn); void rds_cong_remove_conn(struct rds_connection *conn); void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); void rds_cong_queue_updates(struct rds_cong_map *map); void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); int rds_cong_updated_since(unsigned long *recent); void rds_cong_add_socket(struct rds_sock *); void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_check_all_paths(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len); __printf(2, 3) void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); #define rds_conn_path_error(cp, fmt...) \ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) static inline int rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) { return atomic_cmpxchg(&cp->cp_state, old, new) == old; } static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_transition(&conn->c_path[0], old, new); } static inline int rds_conn_path_state(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state); } static inline int rds_conn_state(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_state(&conn->c_path[0]); } static inline int rds_conn_path_up(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_UP; } static inline int rds_conn_path_down(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_DOWN; } static inline int rds_conn_up(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_up(&conn->c_path[0]); } static inline int rds_conn_path_connecting(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; } static inline int rds_conn_connecting(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_connecting(&conn->c_path[0]); } /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); void rds_message_unmapped(struct rds_message *rm); void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); static inline void rds_message_make_checksum(struct rds_header *hdr) { hdr->h_csum = 0; hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); } static inline int rds_message_verify_checksum(const struct rds_header *hdr) { return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; } /* page.c */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp); void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, struct in6_addr *saddr); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked); void rds_send_ping(struct rds_connection *conn, int cp_index); int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec); int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rm_rdma_op *ro); void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void __rds_put_mr_final(struct kref *kref); static inline bool rds_destroy_pending(struct rds_connection *conn) { return !check_net(rds_conn_net(conn)) || (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } enum { ODP_NOT_NEEDED, ODP_ZEROBASED, ODP_VIRTUAL }; /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ per_cpu(which, get_cpu()).member++; \ put_cpu(); \ } while (0) #define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) #define rds_stats_add_which(which, member, count) do { \ per_cpu(which, get_cpu()).member += count; \ put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; extern unsigned long rds_sysctl_sndbuf_max; extern unsigned long rds_sysctl_reconnect_min_jiffies; extern unsigned long rds_sysctl_reconnect_max_jiffies; extern unsigned int rds_sysctl_max_unacked_packets; extern unsigned int rds_sysctl_max_unacked_bytes; extern unsigned int rds_sysctl_ping_enable; extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; void rds_queue_reconnect(struct rds_conn_path *cp); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_connect_path_complete(struct rds_c