| 1 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer device management * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de> * *---------------------------------------------------------------- * * This device handler separates the card driver module from sequencer * stuff (sequencer core, synth drivers, etc), so that user can avoid * to spend unnecessary resources e.g. if he needs only listening to * MP3s. * * The card (or lowlevel) driver creates a sequencer device entry * via snd_seq_device_new(). This is an entry pointer to communicate * with the sequencer device "driver", which is involved with the * actual part to communicate with the sequencer core. * Each sequencer device entry has an id string and the corresponding * driver with the same id is loaded when required. For example, * lowlevel codes to access emu8000 chip on sbawe card are included in * emu8000-synth module. To activate this module, the hardware * resources like i/o port are passed via snd_seq_device argument. */ #include <linux/device.h> #include <linux/init.h> #include <linux/module.h> #include <sound/core.h> #include <sound/info.h> #include <sound/seq_device.h> #include <sound/seq_kernel.h> #include <sound/initval.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/mutex.h> MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("ALSA sequencer device management"); MODULE_LICENSE("GPL"); /* * bus definition */ static int snd_seq_bus_match(struct device *dev, const struct device_driver *drv) { struct snd_seq_device *sdev = to_seq_dev(dev); const struct snd_seq_driver *sdrv = to_seq_drv(drv); return strcmp(sdrv->id, sdev->id) == 0 && sdrv->argsize == sdev->argsize; } static const struct bus_type snd_seq_bus_type = { .name = "snd_seq", .match = snd_seq_bus_match, }; /* * proc interface -- just for compatibility */ #ifdef CONFIG_SND_PROC_FS static struct snd_info_entry *info_entry; static int print_dev_info(struct device *dev, void *data) { struct snd_seq_device *sdev = to_seq_dev(dev); struct snd_info_buffer *buffer = data; snd_iprintf(buffer, "snd-%s,%s,%d\n", sdev->id, dev->driver ? "loaded" : "empty", dev->driver ? 1 : 0); return 0; } static void snd_seq_device_info(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { bus_for_each_dev(&snd_seq_bus_type, NULL, buffer, print_dev_info); } #endif /* * load all registered drivers (called from seq_clientmgr.c) */ #ifdef CONFIG_MODULES /* flag to block auto-loading */ static atomic_t snd_seq_in_init = ATOMIC_INIT(1); /* blocked as default */ static int request_seq_drv(struct device *dev, void *data) { struct snd_seq_device *sdev = to_seq_dev(dev); if (!dev->driver) request_module("snd-%s", sdev->id); return 0; } static void autoload_drivers(struct work_struct *work) { /* avoid reentrance */ if (atomic_inc_return(&snd_seq_in_init) == 1) bus_for_each_dev(&snd_seq_bus_type, NULL, NULL, request_seq_drv); atomic_dec(&snd_seq_in_init); } static DECLARE_WORK(autoload_work, autoload_drivers); static void queue_autoload_drivers(void) { schedule_work(&autoload_work); } void snd_seq_autoload_init(void) { atomic_dec(&snd_seq_in_init); #ifdef CONFIG_SND_SEQUENCER_MODULE /* initial autoload only when snd-seq is a module */ queue_autoload_drivers(); #endif } EXPORT_SYMBOL(snd_seq_autoload_init); void snd_seq_autoload_exit(void) { atomic_inc(&snd_seq_in_init); } EXPORT_SYMBOL(snd_seq_autoload_exit); void snd_seq_device_load_drivers(void) { queue_autoload_drivers(); flush_work(&autoload_work); } EXPORT_SYMBOL(snd_seq_device_load_drivers); static inline void cancel_autoload_drivers(void) { cancel_work_sync(&autoload_work); } #else static inline void queue_autoload_drivers(void) { } static inline void cancel_autoload_drivers(void) { } #endif /* * device management */ static int snd_seq_device_dev_free(struct snd_device *device) { struct snd_seq_device *dev = device->device_data; cancel_autoload_drivers(); if (dev->private_free) dev->private_free(dev); put_device(&dev->dev); return 0; } static int snd_seq_device_dev_register(struct snd_device *device) { struct snd_seq_device *dev = device->device_data; int err; err = device_add(&dev->dev); if (err < 0) return err; if (!dev->dev.driver) queue_autoload_drivers(); return 0; } static int snd_seq_device_dev_disconnect(struct snd_device *device) { struct snd_seq_device *dev = device->device_data; device_del(&dev->dev); return 0; } static void snd_seq_dev_release(struct device *dev) { kfree(to_seq_dev(dev)); } /* * register a sequencer device * card = card info * device = device number (if any) * id = id of driver * result = return pointer (NULL allowed if unnecessary) */ int snd_seq_device_new(struct snd_card *card, int device, const char *id, int argsize, struct snd_seq_device **result) { struct snd_seq_device *dev; int err; static const struct snd_device_ops dops = { .dev_free = snd_seq_device_dev_free, .dev_register = snd_seq_device_dev_register, .dev_disconnect = snd_seq_device_dev_disconnect, }; if (result) *result = NULL; if (snd_BUG_ON(!id)) return -EINVAL; dev = kzalloc(sizeof(*dev) + argsize, GFP_KERNEL); if (!dev) return -ENOMEM; /* set up device info */ dev->card = card; dev->device = device; dev->id = id; dev->argsize = argsize; device_initialize(&dev->dev); dev->dev.parent = &card->card_dev; dev->dev.bus = &snd_seq_bus_type; dev->dev.release = snd_seq_dev_release; dev_set_name(&dev->dev, "%s-%d-%d", dev->id, card->number, device); /* add this device to the list */ err = snd_device_new(card, SNDRV_DEV_SEQUENCER, dev, &dops); if (err < 0) { put_device(&dev->dev); return err; } if (result) *result = dev; return 0; } EXPORT_SYMBOL(snd_seq_device_new); /* * driver registration */ int __snd_seq_driver_register(struct snd_seq_driver *drv, struct module *mod) { if (WARN_ON(!drv->driver.name || !drv->id)) return -EINVAL; drv->driver.bus = &snd_seq_bus_type; drv->driver.owner = mod; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__snd_seq_driver_register); void snd_seq_driver_unregister(struct snd_seq_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(snd_seq_driver_unregister); /* * module part */ static int __init seq_dev_proc_init(void) { #ifdef CONFIG_SND_PROC_FS info_entry = snd_info_create_module_entry(THIS_MODULE, "drivers", snd_seq_root); if (info_entry == NULL) return -ENOMEM; info_entry->content = SNDRV_INFO_CONTENT_TEXT; info_entry->c.text.read = snd_seq_device_info; if (snd_info_register(info_entry) < 0) { snd_info_free_entry(info_entry); return -ENOMEM; } #endif return 0; } static int __init alsa_seq_device_init(void) { int err; err = bus_register(&snd_seq_bus_type); if (err < 0) return err; err = seq_dev_proc_init(); if (err < 0) bus_unregister(&snd_seq_bus_type); return err; } static void __exit alsa_seq_device_exit(void) { #ifdef CONFIG_MODULES cancel_work_sync(&autoload_work); #endif #ifdef CONFIG_SND_PROC_FS snd_info_free_entry(info_entry); #endif bus_unregister(&snd_seq_bus_type); } subsys_initcall(alsa_seq_device_init) module_exit(alsa_seq_device_exit) |
| 6 97 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | // SPDX-License-Identifier: GPL-2.0+ #include <drm/drm_atomic_helper.h> #include <drm/drm_edid.h> #include <drm/drm_managed.h> #include <drm/drm_probe_helper.h> #include "vkms_connector.h" static const struct drm_connector_funcs vkms_connector_funcs = { .fill_modes = drm_helper_probe_single_connector_modes, .reset = drm_atomic_helper_connector_reset, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; static int vkms_conn_get_modes(struct drm_connector *connector) { int count; /* Use the default modes list from DRM */ count = drm_add_modes_noedid(connector, XRES_MAX, YRES_MAX); drm_set_preferred_mode(connector, XRES_DEF, YRES_DEF); return count; } static struct drm_encoder *vkms_conn_best_encoder(struct drm_connector *connector) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) return encoder; return NULL; } static const struct drm_connector_helper_funcs vkms_conn_helper_funcs = { .get_modes = vkms_conn_get_modes, .best_encoder = vkms_conn_best_encoder, }; struct vkms_connector *vkms_connector_init(struct vkms_device *vkmsdev) { struct drm_device *dev = &vkmsdev->drm; struct vkms_connector *connector; int ret; connector = drmm_kzalloc(dev, sizeof(*connector), GFP_KERNEL); if (!connector) return ERR_PTR(-ENOMEM); ret = drmm_connector_init(dev, &connector->base, &vkms_connector_funcs, DRM_MODE_CONNECTOR_VIRTUAL, NULL); if (ret) return ERR_PTR(ret); drm_connector_helper_add(&connector->base, &vkms_conn_helper_funcs); return connector; } |
| 609 3 2 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 1 3 2 1 1 1 609 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 | // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> * (C) 2011 Intra2net AG <https://www.intra2net.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netns/generic.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); struct nf_acct { atomic64_t pkts; atomic64_t bytes; unsigned long flags; struct list_head head; refcount_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; char data[]; }; struct nfacct_filter { u32 value; u32 mask; }; struct nfnl_acct_net { struct list_head nfnl_acct_list; }; static unsigned int nfnl_acct_net_id __read_mostly; static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) { return net_generic(net, nfnl_acct_net_id); } #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *nfacct, *matching = NULL; unsigned int size = 0; char *acct_name; u32 flags = 0; if (!tb[NFACCT_NAME]) return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); if (strlen(acct_name) == 0) return -EINVAL; list_for_each_entry(nfacct, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = nfacct; break; } if (matching) { if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); smp_mb__before_atomic(); /* reset overquota flag if quota is enabled. */ if ((matching->flags & NFACCT_F_QUOTA)) clear_bit(NFACCT_OVERQUOTA_BIT, &matching->flags); return 0; } return -EBUSY; } if (tb[NFACCT_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFACCT_FLAGS])); if (flags & ~NFACCT_F_QUOTA) return -EOPNOTSUPP; if ((flags & NFACCT_F_QUOTA) == NFACCT_F_QUOTA) return -EINVAL; if (flags & NFACCT_F_OVERQUOTA) return -EINVAL; if ((flags & NFACCT_F_QUOTA) && !tb[NFACCT_QUOTA]) return -EINVAL; size += sizeof(u64); } nfacct = kzalloc(sizeof(struct nf_acct) + size, GFP_KERNEL); if (nfacct == NULL) return -ENOMEM; if (flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)nfacct->data; *quota = be64_to_cpu(nla_get_be64(tb[NFACCT_QUOTA])); nfacct->flags = flags; } nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, be64_to_cpu(nla_get_be64(tb[NFACCT_BYTES]))); } if (tb[NFACCT_PKTS]) { atomic64_set(&nfacct->pkts, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } refcount_set(&nfacct->refcnt, 1); list_add_tail_rcu(&nfacct->head, &nfnl_acct_net->nfnl_acct_list); return 0; } static int nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_acct *acct) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; u64 pkts, bytes; u32 old_flags; event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; old_flags = acct->flags; if (type == NFNL_MSG_ACCT_GET_CTRZERO) { pkts = atomic64_xchg(&acct->pkts, 0); bytes = atomic64_xchg(&acct->bytes, 0); smp_mb__before_atomic(); if (acct->flags & NFACCT_F_QUOTA) clear_bit(NFACCT_OVERQUOTA_BIT, &acct->flags); } else { pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); } if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts), NFACCT_PAD) || nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes), NFACCT_PAD) || nla_put_be32(skb, NFACCT_USE, htonl(refcount_read(&acct->refcnt)))) goto nla_put_failure; if (acct->flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)acct->data; if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) || nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota), NFACCT_PAD)) goto nla_put_failure; } nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; if (cb->args[2]) return 0; last = (struct nf_acct *)cb->args[1]; if (cb->args[1]) cb->args[1] = 0; rcu_read_lock(); list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; last = NULL; } if (filter && (cur->flags & filter->mask) != filter->value) continue; if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), NFNL_MSG_ACCT_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; break; } } if (!cb->args[1]) cb->args[2] = 1; rcu_read_unlock(); return skb->len; } static int nfnl_acct_done(struct netlink_callback *cb) { kfree(cb->data); return 0; } static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { [NFACCT_FILTER_MASK] = { .type = NLA_U32 }, [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, }; static int nfnl_acct_start(struct netlink_callback *cb) { const struct nlattr *const attr = cb->data; struct nlattr *tb[NFACCT_FILTER_MAX + 1]; struct nfacct_filter *filter; int err; if (!attr) return 0; err = nla_parse_nested_deprecated(tb, NFACCT_FILTER_MAX, attr, filter_policy, NULL); if (err < 0) return err; if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE]) return -EINVAL; filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); if (!filter) return -ENOMEM; filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); cb->data = filter; return 0; } static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, .start = nfnl_acct_start, .done = nfnl_acct_done, .data = (void *)tb[NFACCT_FILTER], }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!tb[NFACCT_NAME]) return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; break; } ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_ACCT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } /* try to delete object, fail if it is still in use. */ static int nfnl_acct_try_del(struct nf_acct *cur) { int ret = 0; /* We want to avoid races with nfnl_acct_put. So only when the current * refcnt is 1, we decrease it to 0. */ if (refcount_dec_if_one(&cur->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&cur->head); kfree_rcu(cur, rcu_head); } else { ret = -EBUSY; } return ret; } static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; if (!tb[NFACCT_NAME]) { list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); list_for_each_entry(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; ret = nfnl_acct_try_del(cur); if (ret < 0) return ret; break; } return ret; } static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, [NFACCT_BYTES] = { .type = NLA_U64 }, [NFACCT_PKTS] = { .type = NLA_U64 }, [NFACCT_FLAGS] = { .type = NLA_U32 }, [NFACCT_QUOTA] = { .type = NLA_U64 }, [NFACCT_FILTER] = {.type = NLA_NESTED }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, .type = NFNL_CB_MUTEX, .attr_count = NFACCT_MAX, .policy = nfnl_acct_policy }, }; static const struct nfnetlink_subsystem nfnl_acct_subsys = { .name = "acct", .subsys_id = NFNL_SUBSYS_ACCT, .cb_count = NFNL_MSG_ACCT_MAX, .cb = nfnl_acct_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *acct = NULL; rcu_read_lock(); list_for_each_entry_rcu(cur, &nfnl_acct_net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; if (!try_module_get(THIS_MODULE)) goto err; if (!refcount_inc_not_zero(&cur->refcnt)) { module_put(THIS_MODULE); goto err; } acct = cur; break; } err: rcu_read_unlock(); return acct; } EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { if (refcount_dec_and_test(&acct->refcnt)) kfree_rcu(acct, rcu_head); module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(nfnl_acct_put); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) { atomic64_inc(&nfacct->pkts); atomic64_add(skb->len, &nfacct->bytes); } EXPORT_SYMBOL_GPL(nfnl_acct_update); static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) { int ret; struct sk_buff *skb; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (skb == NULL) return; ret = nfnl_acct_fill_info(skb, 0, 0, NFNL_MSG_ACCT_OVERQUOTA, 0, nfacct); if (ret <= 0) { kfree_skb(skb); return; } nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC); } int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) { u64 now; u64 *quota; int ret = NFACCT_UNDERQUOTA; /* no place here if we don't have a quota */ if (!(nfacct->flags & NFACCT_F_QUOTA)) return NFACCT_NO_QUOTA; quota = (u64 *)nfacct->data; now = (nfacct->flags & NFACCT_F_QUOTA_PKTS) ? atomic64_read(&nfacct->pkts) : atomic64_read(&nfacct->bytes); ret = now > *quota; if (now >= *quota && !test_and_set_bit(NFACCT_OVERQUOTA_BIT, &nfacct->flags)) { nfnl_overquota_report(net, nfacct); } return ret; } EXPORT_SYMBOL_GPL(nfnl_acct_overquota); static int __net_init nfnl_acct_net_init(struct net *net) { INIT_LIST_HEAD(&nfnl_acct_pernet(net)->nfnl_acct_list); return 0; } static void __net_exit nfnl_acct_net_exit(struct net *net) { struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); struct nf_acct *cur, *tmp; list_for_each_entry_safe(cur, tmp, &nfnl_acct_net->nfnl_acct_list, head) { list_del_rcu(&cur->head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, .id = &nfnl_acct_net_id, .size = sizeof(struct nfnl_acct_net), }; static int __init nfnl_acct_init(void) { int ret; ret = register_pernet_subsys(&nfnl_acct_ops); if (ret < 0) { pr_err("nfnl_acct_init: failed to register pernet ops\n"); goto err_out; } ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); goto cleanup_pernet; } return 0; cleanup_pernet: unregister_pernet_subsys(&nfnl_acct_ops); err_out: return ret; } static void __exit nfnl_acct_exit(void) { nfnetlink_subsys_unregister(&nfnl_acct_subsys); unregister_pernet_subsys(&nfnl_acct_ops); } module_init(nfnl_acct_init); module_exit(nfnl_acct_exit); |
| 3 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 | // SPDX-License-Identifier: GPL-2.0 /* * Root Complex Event Collector Support * * Authors: * Sean V Kelley <sean.v.kelley@intel.com> * Qiuxu Zhuo <qiuxu.zhuo@intel.com> * * Copyright (C) 2020 Intel Corp. */ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/pci_regs.h> #include "../pci.h" struct walk_rcec_data { struct pci_dev *rcec; int (*user_callback)(struct pci_dev *dev, void *data); void *user_data; }; static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep) { unsigned long bitmap = rcec->rcec_ea->bitmap; unsigned int devn; /* An RCiEP found on a different bus in range */ if (rcec->bus->number != rciep->bus->number) return true; /* Same bus, so check bitmap */ for_each_set_bit(devn, &bitmap, 32) if (devn == PCI_SLOT(rciep->devfn)) return true; return false; } static int link_rcec_helper(struct pci_dev *dev, void *data) { struct walk_rcec_data *rcec_data = data; struct pci_dev *rcec = rcec_data->rcec; if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && rcec_assoc_rciep(rcec, dev)) { dev->rcec = rcec; pci_dbg(dev, "PME & error events signaled via %s\n", pci_name(rcec)); } return 0; } static int walk_rcec_helper(struct pci_dev *dev, void *data) { struct walk_rcec_data *rcec_data = data; struct pci_dev *rcec = rcec_data->rcec; if ((pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) && rcec_assoc_rciep(rcec, dev)) rcec_data->user_callback(dev, rcec_data->user_data); return 0; } static void walk_rcec(int (*cb)(struct pci_dev *dev, void *data), void *userdata) { struct walk_rcec_data *rcec_data = userdata; struct pci_dev *rcec = rcec_data->rcec; u8 nextbusn, lastbusn; struct pci_bus *bus; unsigned int bnr; if (!rcec->rcec_ea) return; /* Walk own bus for bitmap based association */ pci_walk_bus(rcec->bus, cb, rcec_data); nextbusn = rcec->rcec_ea->nextbusn; lastbusn = rcec->rcec_ea->lastbusn; /* All RCiEP devices are on the same bus as the RCEC */ if (nextbusn == 0xff && lastbusn == 0x00) return; for (bnr = nextbusn; bnr <= lastbusn; bnr++) { /* No association indicated (PCIe 5.0-1, 7.9.10.3) */ if (bnr == rcec->bus->number) continue; bus = pci_find_bus(pci_domain_nr(rcec->bus), bnr); if (!bus) continue; /* Find RCiEP devices on the given bus ranges */ pci_walk_bus(bus, cb, rcec_data); } } /** * pcie_link_rcec - Link RCiEP devices associated with RCEC. * @rcec: RCEC whose RCiEP devices should be linked. * * Link the given RCEC to each RCiEP device found. */ void pcie_link_rcec(struct pci_dev *rcec) { struct walk_rcec_data rcec_data; if (!rcec->rcec_ea) return; rcec_data.rcec = rcec; rcec_data.user_callback = NULL; rcec_data.user_data = NULL; walk_rcec(link_rcec_helper, &rcec_data); } /** * pcie_walk_rcec - Walk RCiEP devices associating with RCEC and call callback. * @rcec: RCEC whose RCiEP devices should be walked * @cb: Callback to be called for each RCiEP device found * @userdata: Arbitrary pointer to be passed to callback * * Walk the given RCEC. Call the callback on each RCiEP found. * * If @cb returns anything other than 0, break out. */ void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata) { struct walk_rcec_data rcec_data; if (!rcec->rcec_ea) return; rcec_data.rcec = rcec; rcec_data.user_callback = cb; rcec_data.user_data = userdata; walk_rcec(walk_rcec_helper, &rcec_data); } void pci_rcec_init(struct pci_dev *dev) { struct rcec_ea *rcec_ea; u32 rcec, hdr, busn; u8 ver; /* Only for Root Complex Event Collectors */ if (pci_pcie_type(dev) != PCI_EXP_TYPE_RC_EC) return; rcec = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_RCEC); if (!rcec) return; rcec_ea = kzalloc(sizeof(*rcec_ea), GFP_KERNEL); if (!rcec_ea) return; pci_read_config_dword(dev, rcec + PCI_RCEC_RCIEP_BITMAP, &rcec_ea->bitmap); /* Check whether RCEC BUSN register is present */ pci_read_config_dword(dev, rcec, &hdr); ver = PCI_EXT_CAP_VER(hdr); if (ver >= PCI_RCEC_BUSN_REG_VER) { pci_read_config_dword(dev, rcec + PCI_RCEC_BUSN, &busn); rcec_ea->nextbusn = PCI_RCEC_BUSN_NEXT(busn); rcec_ea->lastbusn = PCI_RCEC_BUSN_LAST(busn); } else { /* Avoid later ver check by setting nextbusn */ rcec_ea->nextbusn = 0xff; rcec_ea->lastbusn = 0x00; } dev->rcec_ea = rcec_ea; } void pci_rcec_exit(struct pci_dev *dev) { kfree(dev->rcec_ea); dev->rcec_ea = NULL; } |
| 8 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | // SPDX-License-Identifier: GPL-2.0-or-later /* * History * 03-01-2007 Added forwarding for x.25 Andrew Hendry */ #define pr_fmt(fmt) "X25: " fmt #include <linux/if_arp.h> #include <linux/init.h> #include <linux/slab.h> #include <net/x25.h> LIST_HEAD(x25_forward_list); DEFINE_RWLOCK(x25_forward_list_lock); int x25_forward_call(struct x25_address *dest_addr, struct x25_neigh *from, struct sk_buff *skb, int lci) { struct x25_route *rt; struct x25_neigh *neigh_new = NULL; struct x25_forward *x25_frwd, *new_frwd; struct sk_buff *skbn; short same_lci = 0; int rc = 0; if ((rt = x25_get_route(dest_addr)) == NULL) goto out_no_route; if ((neigh_new = x25_get_neigh(rt->dev)) == NULL) { /* This shouldn't happen, if it occurs somehow * do something sensible */ goto out_put_route; } /* Avoid a loop. This is the normal exit path for a * system with only one x.25 iface and default route */ if (rt->dev == from->dev) { goto out_put_nb; } /* Remote end sending a call request on an already * established LCI? It shouldn't happen, just in case.. */ read_lock_bh(&x25_forward_list_lock); list_for_each_entry(x25_frwd, &x25_forward_list, node) { if (x25_frwd->lci == lci) { pr_warn("call request for lci which is already registered!, transmitting but not registering new pair\n"); same_lci = 1; } } read_unlock_bh(&x25_forward_list_lock); /* Save the forwarding details for future traffic */ if (!same_lci){ if ((new_frwd = kmalloc(sizeof(struct x25_forward), GFP_ATOMIC)) == NULL){ rc = -ENOMEM; goto out_put_nb; } new_frwd->lci = lci; new_frwd->dev1 = rt->dev; new_frwd->dev2 = from->dev; write_lock_bh(&x25_forward_list_lock); list_add(&new_frwd->node, &x25_forward_list); write_unlock_bh(&x25_forward_list_lock); } /* Forward the call request */ if ( (skbn = skb_clone(skb, GFP_ATOMIC)) == NULL){ goto out_put_nb; } x25_transmit_link(skbn, neigh_new); rc = 1; out_put_nb: x25_neigh_put(neigh_new); out_put_route: x25_route_put(rt); out_no_route: return rc; } int x25_forward_data(int lci, struct x25_neigh *from, struct sk_buff *skb) { struct x25_forward *frwd; struct net_device *peer = NULL; struct x25_neigh *nb; struct sk_buff *skbn; int rc = 0; read_lock_bh(&x25_forward_list_lock); list_for_each_entry(frwd, &x25_forward_list, node) { if (frwd->lci == lci) { /* The call is established, either side can send */ if (from->dev == frwd->dev1) { peer = frwd->dev2; } else { peer = frwd->dev1; } break; } } read_unlock_bh(&x25_forward_list_lock); if ( (nb = x25_get_neigh(peer)) == NULL) goto out; if ( (skbn = pskb_copy(skb, GFP_ATOMIC)) == NULL){ goto output; } x25_transmit_link(skbn, nb); rc = 1; output: x25_neigh_put(nb); out: return rc; } void x25_clear_forward_by_lci(unsigned int lci) { struct x25_forward *fwd, *tmp; write_lock_bh(&x25_forward_list_lock); list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) { if (fwd->lci == lci) { list_del(&fwd->node); kfree(fwd); } } write_unlock_bh(&x25_forward_list_lock); } void x25_clear_forward_by_dev(struct net_device *dev) { struct x25_forward *fwd, *tmp; write_lock_bh(&x25_forward_list_lock); list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) { if ((fwd->dev1 == dev) || (fwd->dev2 == dev)){ list_del(&fwd->node); kfree(fwd); } } write_unlock_bh(&x25_forward_list_lock); } |
| 4 2 2 2 2 1 1 1 1 1 1 1 3 3 3 3 4 4 5 3 2 1 1 1 2 167 4 4 2 94 94 94 94 94 94 94 94 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the interrupt descriptor management code. Detailed * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "internals.h" /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with * bugreports caused by random commandline masks */ cpumask_set_cpu(smp_processor_id(), irq_default_affinity); return 1; } __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) { } #endif #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif return 0; } static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { if (!affinity) affinity = irq_default_affinity; cpumask_copy(desc->irq_common_data.affinity, affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif #ifdef CONFIG_NUMA desc->irq_common_data.node = node; #endif } static void free_masks(struct irq_desc *desc) { #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif } #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, const struct cpumask *affinity, struct module *owner) { int cpu; desc->irq_common_data.handler_data = NULL; desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; desc_smp_init(desc, node, affinity); } static unsigned int nr_irqs = NR_IRQS; /** * irq_get_nr_irqs() - Number of interrupts supported by the system. */ unsigned int irq_get_nr_irqs(void) { return nr_irqs; } EXPORT_SYMBOL_GPL(irq_get_nr_irqs); /** * irq_set_nr_irqs() - Set the number of interrupts supported by the system. * @nr: New number of interrupts. * * Return: @nr. */ unsigned int irq_set_nr_irqs(unsigned int nr) { nr_irqs = nr; return nr; } EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU, sparse_irq_lock); static int irq_find_free_area(unsigned int from, unsigned int cnt) { MA_STATE(mas, &sparse_irqs, 0, 0); if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) return -ENOSPC; return mas.index; } static unsigned int irq_find_at_or_after(unsigned int offset) { unsigned long index = offset; struct irq_desc *desc; guard(rcu)(); desc = mt_find(&sparse_irqs, &index, nr_irqs); return desc ? irq_desc_get_irq(desc) : nr_irqs; } static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { MA_STATE(mas, &sparse_irqs, irq, irq); WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); } static void delete_irq_desc(unsigned int irq) { MA_STATE(mas, &sparse_irqs, irq, irq); mas_erase(&mas); } #ifdef CONFIG_SPARSE_IRQ static const struct kobj_type irq_kobj_type; #endif static int init_desc(struct irq_desc *desc, int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { desc->kstat_irqs = alloc_percpu(struct irqstat); if (!desc->kstat_irqs) return -ENOMEM; if (alloc_masks(desc, node)) { free_percpu(desc->kstat_irqs); return -ENOMEM; } raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); irq_resend_init(desc); #ifdef CONFIG_SPARSE_IRQ kobject_init(&desc->kobj, &irq_kobj_type); init_rcu_head(&desc->rcu); #endif return 0; } #ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); #ifdef CONFIG_SYSFS static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; char *p = ""; int cpu; for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip && desc->irq_data.chip->name) return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); return 0; } IRQ_ATTR_RO(chip_name); static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); return 0; } IRQ_ATTR_RO(hwirq); static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); } IRQ_ATTR_RO(type); static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->name) return sysfs_emit(buf, "%s\n", desc->name); return 0; } IRQ_ATTR_RO(name); static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; scoped_guard(raw_spinlock_irq, &desc->lock) { for_each_action_of_desc(desc, action) { ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); p = ","; } } if (ret) ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); static struct attribute *irq_attrs[] = { &per_cpu_count_attr.attr, &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL }; ATTRIBUTE_GROUPS(irq); static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) { if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing * crucial and failures in the late irq_sysfs_init() * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); else desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* * Only invoke kobject_del() when kobject_add() was successfully * invoked for the descriptor. This covers both early boot, where * sysfs is not initialized yet, and the case of a failed * kobject_add() invocation. */ if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } static int __init irq_sysfs_init(void) { struct irq_desc *desc; int irq; /* Prevent concurrent irq alloc/free */ guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); if (!irq_kobj_base) return -ENOMEM; /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); return 0; } postcore_initcall(irq_sysfs_init); #else /* !CONFIG_SYSFS */ static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ struct irq_desc *irq_to_desc(unsigned int irq) { return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); } void irq_unlock_sparse(void) { mutex_unlock(&sparse_irq_lock); } static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; ret = init_desc(desc, irq, node, flags, affinity, owner); if (unlikely(ret)) { kfree(desc); return NULL; } return desc; } static void irq_kobj_release(struct kobject *kobj) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } static void delayed_free_desc(struct rcu_head *rhp) { struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); kobject_put(&desc->kobj); } static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* * sparse_irq_lock protects also show_interrupts() and * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. * * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ irq_sysfs_del(desc); delete_irq_desc(irq); /* * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { struct irq_desc *desc; int i; /* Validate affinity mask(s) */ if (affinity) { for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } for (i = 0; i < cnt; i++) { const struct cpumask *mask = NULL; unsigned int flags = 0; if (affinity) { if (affinity->is_managed) { flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } return start; err: for (i--; i >= 0; i--) free_desc(start + i); return -ENOMEM; } static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) return false; nr_irqs = nr; return true; } int __init early_irq_init(void) { int i, initcnt, node = first_online_node; struct irq_desc *desc; init_irq_default_affinity(); /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) nr_irqs = MAX_SPARSE_IRQS; if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); irq_insert_desc(i, desc); } return arch_early_irq_init(); } #else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; int __init early_irq_init(void) { int count, i, node = first_online_node; int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); if (unlikely(ret)) goto __free_desc_res; } return arch_early_irq_init(); __free_desc_res: while (--i >= 0) { free_masks(irq_desc + i); free_percpu(irq_desc[i].kstat_irqs); } return ret; } struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); scoped_guard(raw_spinlock_irqsave, &desc->lock) desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; for (i = 0; i < cnt; i++) { struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; irq_insert_desc(start + i, desc); } return start; } static inline bool irq_expand_nr_irqs(unsigned int nr) { return false; } void irq_mark_irq(unsigned int irq) { guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq) { free_desc(irq); } #endif #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) { struct irq_data *data; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); return 0; } /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_irq(unsigned int irq) { return handle_irq_desc(irq_to_desc(irq)); } EXPORT_SYMBOL_GPL(generic_handle_irq); /** * generic_handle_irq_safe - Invoke the handler for a particular irq from any * context. * @irq: The irq number to handle * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process context). It * will report an error if not invoked from IRQ context and the irq has been * marked to enforce IRQ-context only. */ int generic_handle_irq_safe(unsigned int irq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_to_desc(irq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_irq_safe); #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); /** * generic_handle_irq_safe - Invoke the handler for a HW irq belonging * to a domain from any context. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an NMI context with irq regs * initialized. **/ int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } #endif /* Dynamic interrupt handling */ /** * irq_free_descs - free irq descriptors * @from: Start of descriptor range * @cnt: Number of consecutive irqs to free */ void irq_free_descs(unsigned int from, unsigned int cnt) { int i; if (from >= nr_irqs || (from + cnt) > nr_irqs) return; guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); } EXPORT_SYMBOL_GPL(irq_free_descs); /** * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) * @affinity: Optional pointer to an affinity mask array of size @cnt which * hints where the irq descriptors should be allocated and which * default affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity) { int start; if (!cnt) return -EINVAL; if (irq >= 0) { if (from > irq) return -EINVAL; from = irq; } else { /* * For interrupts which are freely allocated the * architecture can force a lower bound to the @from * argument. x86 uses this to exclude the GSI space. */ from = arch_dynirq_lower_bound(from); } guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); if (irq >=0 && start != irq) return -EEXIST; if (start + cnt > nr_irqs) { if (!irq_expand_nr_irqs(start + cnt)) return -ENOMEM; } return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * * Returns next irq number after offset or nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { return irq_find_at_or_after(offset); } struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check) { struct irq_desc *desc; desc = irq_to_desc(irq); if (!desc) return NULL; if (check & _IRQ_DESC_CHECK) { if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) return NULL; if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) return NULL; } if (bus) chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); return desc; } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) chip_bus_sync_unlock(desc); } int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || desc->percpu_enabled) return -EINVAL; desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); if (!desc->percpu_enabled) return -ENOMEM; desc->percpu_affinity = affinity ? : cpu_possible_mask; irq_set_percpu_devid_flags(irq); return 0; } int irq_set_percpu_devid(unsigned int irq) { return irq_set_percpu_devid_partition(irq, NULL); } int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->percpu_enabled) return -EINVAL; if (affinity) cpumask_copy(affinity, desc->percpu_affinity); return 0; } EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu * @irq: The interrupt number * @cpu: The cpu number * * Returns the sum of interrupt counts on @cpu since boot for * @irq. The caller must ensure that the interrupt is not removed * concurrently. */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) return data_race(desc->tot_count); for_each_cpu(cpu, cpumask) sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return kstat_irqs_desc(desc, cpu_possible_mask); } #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT void kstat_snapshot_irqs(void) { struct irq_desc *desc; unsigned int irq; for_each_irq_desc(irq, desc) { if (!desc->kstat_irqs) continue; this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt)); } } unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref); } #endif /** * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. * * It uses rcu to protect the access since a concurrent removal of an * interrupt descriptor is observing an rcu grace period before * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; rcu_read_lock(); sum = kstat_irqs(irq); rcu_read_unlock(); return sum; } #ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { lockdep_set_class(&desc->lock, lock_class); lockdep_set_class(&desc->request_mutex, request_class); } } EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); #endif |
| 59 2 327 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 | /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in a * new subdirectory with this name. Additionally when a * group is named, @is_visible and @is_bin_visible may * return SYSFS_GROUP_INVISIBLE to control visibility of * the directory itself. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for * each non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned * value will replace static permissions defined in struct * attribute. Use SYSFS_GROUP_VISIBLE() when assigning this * callback to specify separate _group_visible() and * _attr_visible() handlers. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC (and the * visibility flags for named groups) are accepted. Must * return 0 if a binary attribute is not visible. The * returned value will replace static permissions defined * in struct bin_attribute. If @is_visible is not set, Use * SYSFS_GROUP_VISIBLE() when assigning this callback to * specify separate _group_visible() and _attr_visible() * handlers. * @bin_size: * Optional: Function to return the size of a binary attribute * of the group. Will be called repeatedly for each binary * attribute in the group. Overwrites the size field embedded * inside the attribute itself. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); struct attribute **attrs; union { const struct bin_attribute *const *bin_attrs; const struct bin_attribute *const *bin_attrs_new; }; }; #define SYSFS_PREALLOC 010000 #define SYSFS_GROUP_INVISIBLE 020000 /* * DEFINE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with the assignment of ".is_visible = * SYSFS_GROUP_VISIBLE(name)", that arranges for the directory * associated with a named attribute_group to optionally be hidden. * This allows for static declaration of attribute_groups, and the * simplification of attribute visibility lifetime that implies, * without polluting sysfs with empty attribute directories. * Ex. * * static umode_t example_attr_visible(struct kobject *kobj, * struct attribute *attr, int n) * { * if (example_attr_condition) * return 0; * else if (ro_attr_condition) * return 0444; * return a->mode; * } * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; * * Note that it expects <name>_attr_visible and <name>_group_visible to * be defined. For cases where individual attributes do not need * separate visibility consideration, only entire group visibility at * once, see DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(). */ #define DEFINE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } /* * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name): * A helper macro to pair with SYSFS_GROUP_VISIBLE() that like * DEFINE_SYSFS_GROUP_VISIBLE() controls group visibility, but does * not require the implementation of a per-attribute visibility * callback. * Ex. * * static bool example_group_visible(struct kobject *kobj) * { * if (example_group_condition) * return false; * return true; * } * * DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(example); * * static struct attribute_group example_group = { * .name = "example", * .is_visible = SYSFS_GROUP_VISIBLE(example), * .attrs = &example_attrs, * }; */ #define DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, struct attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } /* * Same as DEFINE_SYSFS_GROUP_VISIBLE, but for groups with only binary * attributes. If an attribute_group defines both text and binary * attributes, the group visibility is determined by the function * specified to is_visible() not is_bin_visible() */ #define DEFINE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *attr, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return name##_attr_visible(kobj, attr, n); \ } #define DEFINE_SIMPLE_SYSFS_BIN_GROUP_VISIBLE(name) \ static inline umode_t sysfs_group_visible_##name( \ struct kobject *kobj, const struct bin_attribute *a, int n) \ { \ if (n == 0 && !name##_group_visible(kobj)) \ return SYSFS_GROUP_INVISIBLE; \ return a->mode; \ } #define SYSFS_GROUP_VISIBLE(fn) sysfs_group_visible_##fn /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .bin_attrs_new = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct address_space; struct bin_attribute { struct attribute attr; size_t size; void *private; struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write_new)(struct file *, struct kobject *, const struct bin_attribute *, char *, loff_t, size_t); loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *, loff_t, int); int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) \ __BIN_ATTR(_name, 0444, _name##_read, NULL, _size) #define __BIN_ATTR_WO(_name, _size) \ __BIN_ATTR(_name, 0200, NULL, _name##_write, _size) #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) #define __BIN_ATTR_ADMIN_RO(_name, _size) \ __BIN_ATTR(_name, 0400, _name##_read, NULL, _size) #define __BIN_ATTR_ADMIN_RW(_name, _size) \ __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) #define BIN_ATTR_ADMIN_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) #define BIN_ATTR_ADMIN_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) #define __BIN_ATTR_SIMPLE_RO(_name, _mode) \ __BIN_ATTR(_name, _mode, sysfs_bin_attr_simple_read, NULL, 0) #define BIN_ATTR_SIMPLE_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444) #define BIN_ATTR_SIMPLE_ADMIN_RO(_name) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * V9FS definitions. * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #ifndef FS_9P_V9FS_H #define FS_9P_V9FS_H #include <linux/backing-dev.h> #include <linux/netfs.h> /** * enum p9_session_flags - option flags for each 9P session * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. * @V9FS_ACCESS_ANY: use a single attach for all users * @V9FS_ACCESS_MASK: bit mask of different ACCESS options * @V9FS_POSIX_ACL: POSIX ACLs are enforced * * Session flags reflect options selected by users at mount time */ #define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ V9FS_ACCESS_USER | \ V9FS_ACCESS_CLIENT) #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY #define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, V9FS_ACCESS_CLIENT = 0x10, V9FS_POSIX_ACL = 0x20, V9FS_NO_XATTR = 0x40, V9FS_IGNORE_QV = 0x80, /* ignore qid.version for cache hints */ V9FS_DIRECT_IO = 0x100, V9FS_SYNC = 0x200 }; /** * enum p9_cache_shortcuts - human readable cache preferences * @CACHE_SC_NONE: disable all caches * @CACHE_SC_READAHEAD: only provide caching for readahead * @CACHE_SC_MMAP: provide caching to enable mmap * @CACHE_SC_LOOSE: non-coherent caching for files and meta data * @CACHE_SC_FSCACHE: persistent non-coherent caching for files and meta-data * */ enum p9_cache_shortcuts { CACHE_SC_NONE = 0b00000000, CACHE_SC_READAHEAD = 0b00000001, CACHE_SC_MMAP = 0b00000101, CACHE_SC_LOOSE = 0b00001111, CACHE_SC_FSCACHE = 0b10001111, }; /** * enum p9_cache_bits - possible values of ->cache * @CACHE_NONE: caches disabled * @CACHE_FILE: file caching (open to close) * @CACHE_META: meta-data and directory caching * @CACHE_WRITEBACK: write-back caching for files * @CACHE_LOOSE: don't check cache consistency * @CACHE_FSCACHE: local persistent caches * */ enum p9_cache_bits { CACHE_NONE = 0b00000000, CACHE_FILE = 0b00000001, CACHE_META = 0b00000010, CACHE_WRITEBACK = 0b00000100, CACHE_LOOSE = 0b00001000, CACHE_FSCACHE = 0b10000000, }; /** * struct v9fs_session_info - per-instance session information * @flags: session options of type &p9_session_flags * @nodev: set to 1 to disable device mapping * @debug: debug level * @afid: authentication handle * @cache: cache mode of type &p9_cache_bits * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy * @maxdata: maximum data to be sent/recvd per protocol message * @dfltuid: default numeric userid to mount hierarchy as * @dfltgid: default numeric groupid to mount hierarchy as * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy * @clnt: reference to 9P network client instantiated for this session * @slist: reference to list of registered 9p sessions * * This structure holds state for each session instance established during * a sys_mount() . * * Bugs: there seems to be a lot of state which could be condensed and/or * removed. */ struct v9fs_session_info { /* options */ unsigned int flags; unsigned char nodev; unsigned short debug; unsigned int afid; unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; struct fscache_volume *fscache; #endif char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ kuid_t dfltuid; /* default uid/muid for legacy support */ kgid_t dfltgid; /* default gid for legacy support */ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; struct mutex v_mutex; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { return container_of(inode, struct v9fs_inode, netfs.inode); } static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE return netfs_i_cookie(&v9inode->netfs); #else return NULL; #endif } static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info *v9ses) { #ifdef CONFIG_9P_FSCACHE return v9ses->fscache; #else return NULL; #endif } extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data); extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern const struct netfs_request_ops v9fs_req_ops; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" #define V9FS_DEFUID KUIDT_INIT(-2) #define V9FS_DEFGID KGIDT_INIT(-2) static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { return inode->i_sb->s_fs_info; } static inline struct v9fs_session_info *v9fs_dentry2v9ses(const struct dentry *dentry) { return dentry->d_sb->s_fs_info; } static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000U; } static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000L; } /** * v9fs_get_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else return v9fs_inode_from_fid(v9ses, fid, sb, 0); } /** * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); else return v9fs_inode_from_fid(v9ses, fid, sb, 1); } #endif |
| 145 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 | // SPDX-License-Identifier: GPL-2.0 /* RTT/RTO calculation. * * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com) * * https://tools.ietf.org/html/rfc6298 * https://tools.ietf.org/html/rfc1122#section-4.2.3.1 * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf */ #include <linux/net.h> #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) #define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* * Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (call->mdev_us >> 2); /* similar update on mdev */ } call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (call->mdev_us > call->mdev_max_us) { call->mdev_max_us = call->mdev_us; if (call->mdev_max_us > call->rttvar_us) call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ call->mdev_us = m << 1; /* make sure rto = 3*rtt */ call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); call->mdev_max_us = call->rttvar_us; } call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; /* 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ call->rto_us = rxrpc_bound_rto(rto); } static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, (u32)rtt_us ? : jiffies_to_usecs(1)); } static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ rxrpc_update_rtt_min(call, resp_time, rtt_us); rxrpc_rtt_estimator(call, rtt_us); rxrpc_set_rto(call); /* Only reset backoff on valid RTT measurement [RFC6298]. */ call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has * exclusive access to the call RTT data. */ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; rxrpc_ack_update_rtt(call, resp_time, rtt_us); if (call->rtt_count < 3) call->rtt_count++; call->rtt_taken++; WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. */ ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; u32 backoff = READ_ONCE(call->backoff); timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; return ns_to_ktime(timo_us * NSEC_PER_USEC); } void rxrpc_call_init_rtt(struct rxrpc_call *call) { call->rtt_last_req = KTIME_MIN; call->rto_us = RXRPC_TIMEOUT_INIT; call->mdev_us = RXRPC_TIMEOUT_INIT; call->backoff = 0; //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); } |
| 648 647 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | // SPDX-License-Identifier: GPL-2.0-or-later /* * SHA-256 optimized for x86_64 * * Copyright 2025 Google LLC */ #include <asm/fpu/api.h> #include <crypto/internal/sha2.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/static_call.h> asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86); DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3); void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks) { if (static_branch_likely(&have_sha256_x86)) { kernel_fpu_begin(); static_call(sha256_blocks_x86)(state, data, nblocks); kernel_fpu_end(); } else { sha256_blocks_generic(state, data, nblocks); } } EXPORT_SYMBOL_GPL(sha256_blocks_simd); void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks) { sha256_blocks_generic(state, data, nblocks); } EXPORT_SYMBOL_GPL(sha256_blocks_arch); bool sha256_is_arch_optimized(void) { return static_key_enabled(&have_sha256_x86); } EXPORT_SYMBOL_GPL(sha256_is_arch_optimized); static int __init sha256_x86_mod_init(void) { if (boot_cpu_has(X86_FEATURE_SHA_NI)) { static_call_update(sha256_blocks_x86, sha256_ni_transform); } else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && boot_cpu_has(X86_FEATURE_AVX)) { if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) static_call_update(sha256_blocks_x86, sha256_transform_rorx); else static_call_update(sha256_blocks_x86, sha256_transform_avx); } else if (!boot_cpu_has(X86_FEATURE_SSSE3)) { return 0; } static_branch_enable(&have_sha256_x86); return 0; } subsys_initcall(sha256_x86_mod_init); static void __exit sha256_x86_mod_exit(void) { } module_exit(sha256_x86_mod_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-256 optimized for x86_64"); |
| 213 213 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM jbd2 #if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_JBD2_H #include <linux/jbd2.h> #include <linux/tracepoint.h> struct transaction_chp_stats_s; struct transaction_run_stats_s; TRACE_EVENT(jbd2_checkpoint, TP_PROTO(journal_t *journal, int result), TP_ARGS(journal, result), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, result ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->result = result; ), TP_printk("dev %d,%d result %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( __field( dev_t, dev ) __field( char, sync_commit ) __field( tid_t, transaction ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), TP_printk("dev %d,%d transaction %u sync %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_commit_locking, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_commit_flushing, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_commit_logging, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction) ); TRACE_EVENT(jbd2_end_commit, TP_PROTO(journal_t *journal, transaction_t *commit_transaction), TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( __field( dev_t, dev ) __field( char, sync_commit ) __field( tid_t, transaction ) __field( tid_t, head ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), TP_printk("dev %d,%d transaction %u sync %d head %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); DECLARE_EVENT_CLASS(jbd2_handle_start_class, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, requested_blocks) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->requested_blocks = requested_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u " "requested_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->requested_blocks) ); DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_start, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks) ); DEFINE_EVENT(jbd2_handle_start_class, jbd2_handle_restart, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int requested_blocks), TP_ARGS(dev, tid, type, line_no, requested_blocks) ); TRACE_EVENT(jbd2_handle_extend, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int buffer_credits, int requested_blocks), TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, buffer_credits ) __field( int, requested_blocks) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->buffer_credits = buffer_credits; __entry->requested_blocks = requested_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u " "buffer_credits %d requested_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->buffer_credits, __entry->requested_blocks) ); TRACE_EVENT(jbd2_handle_stats, TP_PROTO(dev_t dev, tid_t tid, unsigned int type, unsigned int line_no, int interval, int sync, int requested_blocks, int dirtied_blocks), TP_ARGS(dev, tid, type, line_no, interval, sync, requested_blocks, dirtied_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned int, type ) __field( unsigned int, line_no ) __field( int, interval ) __field( int, sync ) __field( int, requested_blocks) __field( int, dirtied_blocks ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->type = type; __entry->line_no = line_no; __entry->interval = interval; __entry->sync = sync; __entry->requested_blocks = requested_blocks; __entry->dirtied_blocks = dirtied_blocks; ), TP_printk("dev %d,%d tid %u type %u line_no %u interval %d " "sync %d requested_blocks %d dirtied_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, __entry->type, __entry->line_no, __entry->interval, __entry->sync, __entry->requested_blocks, __entry->dirtied_blocks) ); TRACE_EVENT(jbd2_run_stats, TP_PROTO(dev_t dev, tid_t tid, struct transaction_run_stats_s *stats), TP_ARGS(dev, tid, stats), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned long, wait ) __field( unsigned long, request_delay ) __field( unsigned long, running ) __field( unsigned long, locked ) __field( unsigned long, flushing ) __field( unsigned long, logging ) __field( __u32, handle_count ) __field( __u32, blocks ) __field( __u32, blocks_logged ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->wait = stats->rs_wait; __entry->request_delay = stats->rs_request_delay; __entry->running = stats->rs_running; __entry->locked = stats->rs_locked; __entry->flushing = stats->rs_flushing; __entry->logging = stats->rs_logging; __entry->handle_count = stats->rs_handle_count; __entry->blocks = stats->rs_blocks; __entry->blocks_logged = stats->rs_blocks_logged; ), TP_printk("dev %d,%d tid %u wait %u request_delay %u running %u " "locked %u flushing %u logging %u handle_count %u " "blocks %u blocks_logged %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->request_delay), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), jiffies_to_msecs(__entry->flushing), jiffies_to_msecs(__entry->logging), __entry->handle_count, __entry->blocks, __entry->blocks_logged) ); TRACE_EVENT(jbd2_checkpoint_stats, TP_PROTO(dev_t dev, tid_t tid, struct transaction_chp_stats_s *stats), TP_ARGS(dev, tid, stats), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tid ) __field( unsigned long, chp_time ) __field( __u32, forced_to_close ) __field( __u32, written ) __field( __u32, dropped ) ), TP_fast_assign( __entry->dev = dev; __entry->tid = tid; __entry->chp_time = stats->cs_chp_time; __entry->forced_to_close= stats->cs_forced_to_close; __entry->written = stats->cs_written; __entry->dropped = stats->cs_dropped; ), TP_printk("dev %d,%d tid %u chp_time %u forced_to_close %u " "written %u dropped %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); TRACE_EVENT(jbd2_update_log_tail, TP_PROTO(journal_t *journal, tid_t first_tid, unsigned long block_nr, unsigned long freed), TP_ARGS(journal, first_tid, block_nr, freed), TP_STRUCT__entry( __field( dev_t, dev ) __field( tid_t, tail_sequence ) __field( tid_t, first_tid ) __field(unsigned long, block_nr ) __field(unsigned long, freed ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->tail_sequence = journal->j_tail_sequence; __entry->first_tid = first_tid; __entry->block_nr = block_nr; __entry->freed = freed; ), TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tail_sequence, __entry->first_tid, __entry->block_nr, __entry->freed) ); TRACE_EVENT(jbd2_write_superblock, TP_PROTO(journal_t *journal, blk_opf_t write_flags), TP_ARGS(journal, write_flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( blk_opf_t, write_flags ) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->write_flags = write_flags; ), TP_printk("dev %d,%d write_flags %x", MAJOR(__entry->dev), MINOR(__entry->dev), (__force u32)__entry->write_flags) ); TRACE_EVENT(jbd2_lock_buffer_stall, TP_PROTO(dev_t dev, unsigned long stall_ms), TP_ARGS(dev, stall_ms), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, stall_ms ) ), TP_fast_assign( __entry->dev = dev; __entry->stall_ms = stall_ms; ), TP_printk("dev %d,%d stall_ms %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->stall_ms) ); DECLARE_EVENT_CLASS(jbd2_journal_shrink, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(unsigned long, count) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->nr_to_scan = nr_to_scan; __entry->count = count; ), TP_printk("dev %d,%d nr_to_scan %lu count %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->count) ); DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count) ); DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), TP_ARGS(journal, nr_to_scan, count) ); TRACE_EVENT(jbd2_shrink_scan_exit, TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long nr_shrunk, unsigned long count), TP_ARGS(journal, nr_to_scan, nr_shrunk, count), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long, nr_to_scan) __field(unsigned long, nr_shrunk) __field(unsigned long, count) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->nr_to_scan = nr_to_scan; __entry->nr_shrunk = nr_shrunk; __entry->count = count; ), TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->nr_shrunk, __entry->count) ); TRACE_EVENT(jbd2_shrink_checkpoint_list, TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, unsigned long nr_freed, tid_t next_tid), TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, first_tid) __field(tid_t, tid) __field(tid_t, last_tid) __field(unsigned long, nr_freed) __field(tid_t, next_tid) ), TP_fast_assign( __entry->dev = journal->j_fs_dev->bd_dev; __entry->first_tid = first_tid; __entry->tid = tid; __entry->last_tid = last_tid; __entry->nr_freed = nr_freed; __entry->next_tid = next_tid; ), TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " "next transaction %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->first_tid, __entry->tid, __entry->last_tid, __entry->nr_freed, __entry->next_tid) ); #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 26 18 19 1 1 18 8 8 19 18 7 5 4 1 4 14 14 14 2 12 14 14 11 14 14 1 13 11 13 13 1 13 13 13 13 13 13 13 13 1 2 26 2 1 1 2 8 8 1 8 8 8 8 8 8 8 1 1 1 4 4 13 286 5 5 5 5 5 5 5 5 6 6 5 5 5 6 609 609 7 609 609 609 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 | // SPDX-License-Identifier: GPL-2.0-or-later /* * TCP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: * linux/net/ipv4/tcp.c * linux/net/ipv4/tcp_input.c * linux/net/ipv4/tcp_output.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. */ #include <linux/bottom_half.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/jiffies.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/indirect_call_wrapper.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> #include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/inet_ecn.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/hotdata.h> #include <net/busy_poll.h> #include <net/rstreason.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <crypto/hash.h> #include <linux/scatterlist.h> #include <trace/events/tcp.h> static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #endif /* Helper returning the inet6 address from a given tcp socket. * It can be used in TCP stack instead of inet6_sk(sk). * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). */ #define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } } static u32 tcp_v6_init_seq(const struct sk_buff *skb) { return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; struct flowi6 fl6; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; memset(&fl6, 0, sizeof(fl6)); if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) { if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. */ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } if (tp->rx_opt.ts_recent_stamp && !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* * TCP over IPv4 */ if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; if (ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { saddr = &fl6.saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) goto failure; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); if (likely(!tp->repair)) { if (!tp->write_seq) WRITE_ONCE(tp->write_seq, secure_tcpv6_seq(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); } if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto late_failure; err = tcp_connect(sk); if (err) goto late_failure; return 0; late_failure: tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); /* Drop requests trying to increase our current mss. * Check done in __ip6_rt_update_pmtu() is too late. */ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) return; dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } } static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; bool fatal; int err; sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, fatal); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { u32 mtu = ntohl(info); /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; if (!ip6_sk_accept_pmtu(sk)) goto out; if (mtu < IPV6_MIN_MTU) goto out; WRITE_ONCE(tp->mtu_info, mtu); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); goto out; } /* Might be for an request_sock */ switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); if (!sock_owned_by_user(sk)) tcp_done_with_error(sk, err); else WRITE_ONCE(sk->sk_err_soft, err); goto out; case TCP_LISTEN: break; default: /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && type == ICMPV6_DEST_UNREACH && code == ICMPV6_NOROUTE) tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tclass |= INET_ECN_ECT_0; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } done: return err; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr, int l3index) { return tcp_md5_do_lookup(sk, l3index, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && prefixlen > 32)) return -EINVAL; } else { prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } addr = (union tcp_md5_addr *)&sin6->sin6_addr; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; struct tcphdr *_th; bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); _th = (struct tcphdr *)(bp + 1); memcpy(_th, th, sizeof(*th)); _th->check = 0; sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->req); } static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { struct tcp_sigpool hp; if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } static int tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; struct tcp_sigpool hp; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb, u32 tw_isn) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ir_rmt_addr = LOOPBACK4_IPV6; ireq->ir_loc_addr = LOOPBACK4_IPV6; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn) { tcp_v6_init_req(req, sk, skb, tw_isn); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup_rsk, .ao_calc_key = tcp_v6_ao_calc_key_rsk, .ao_synack_hash = tcp_v6_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_seq, .init_ts_off = tcp_v6_init_ts_off, .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); unsigned int tot_len = sizeof(struct tcphdr); struct sock *ctl_sk = net->ipv6.tcp_sk; const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; struct sk_buff *buff; struct tcphdr *t1; struct flowi6 fl6; u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; if (tcp_key_is_ao(key)) tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) tot_len += sizeof(__be32); } #endif buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; t1->seq = htonl(seq); t1->ack_seq = htonl(ack); t1->ack = !rst || !th->ack; t1->rst = rst; t1->window = htons(win); topt = (__be32 *)(t1 + 1); if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *topt++ = htonl(tsval); *topt++ = htonl(tsecr); } if (mrst) *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { *topt++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (key->rcv_next)); tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, t1, key->sne); } #endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); else { if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; fl6.flowi6_oif = oif; } if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ if (sk && sk->sk_state != TCP_TIME_WAIT) dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ else dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } kfree_skb(buff); } static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); const __u8 *md5_hash_location = NULL; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) bool allocated_traffic_key = false; #endif const struct tcp_ao_hdr *aoh; struct tcp_key key = {}; u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && !ipv6_unicast_destination(skb)) return; net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif #ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = tcp_v6_sdif(skb) ? dif : 0; key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key.md5_key) goto out; key.type = TCP_KEY_MD5; genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif if (th->ack) seq = ntohl(th->ack_seq); else ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); #ifdef CONFIG_TCP_AO if (aoh) { int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, &key.ao_key, &key.traffic_key, &allocated_traffic_key, &key.rcv_next, &key.sne)) goto out; key.type = TCP_KEY_AO; } #endif if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; txhash = inet_twsk(sk)->tw_txhash; } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK, label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: if (allocated_traffic_key) kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, tclass, label, priority, txhash, key); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb, enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); u8 tclass = tw->tw_tclass; struct tcp_key key = {}; if (tw_status == TCP_TW_ACK_OOW) tclass &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) key.ao_key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); /* rcv_next switches to our rcv_next */ rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.sne = READ_ONCE(ao_info->snd_sne); key.type = TCP_KEY_AO; #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: #endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; const struct tcp_ao_hdr *aoh; int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. */ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), req->ts_recent, sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK, 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; } u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, th); if (mss) { *cookie = __cookie_v6_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void tcp_v6_restore_cb(struct sk_buff *skb) { /* We need to move header back to the beginning if xfrm6_policy_check() * and tcp_v6_fill_cb() are going to be called again. * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. */ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm)); } static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; int l3index; #endif struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* * v6 mapped */ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (!newsk) return NULL; inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count * here, tcp_create_openreq_child now does this for us, see the comment in * that function for the gory details. -acme */ /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); ip6_dst_store(newsk, dst, NULL, NULL); newnp->saddr = ireq->ir_v6_loc_addr; /* Now IPv6 options... First: no IPv4 options. */ newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, but we make one more one thing there: reattach optmem to newsk. */ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); if (!tcp_rsk_used_ao(req)) { /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) goto out; /* OOM */ #endif if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); } } else { if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; out_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: tcp_listendrop(sk); return NULL; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, tcp_rcv_established and rcv_established handle them correctly, but it is not case with tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. --ANK (980728) */ if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); if (nsk != sk) { if (nsk) { reason = tcp_child_process(sk, nsk, skb); if (reason) goto reset; } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); if (reason) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: if (opt_skb) __kfree_skb(opt_skb); sk_skb_reason_drop(sk, skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; ipv6_pktoptions: /* Do you ask, what is it? 1. skb was enqueued by tcp. 2. skb is added to tail of read queue, rather than out of order. 3. socket is not in passive state. 4. Finally, it really contains options, which user wants to receive. */ tp = tcp_sk(sk); if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb)); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } consume_skb(opt_skb); return 0; } static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, const struct tcphdr *th) { /* This is tricky: we move IP6CB at its correct location into * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because * _decode_session6() uses IP6CB(). * barrier() makes sure compiler won't play aliasing games. */ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; enum tcp_tw_status tw_status; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; struct sock *sk = NULL; bool refcounted; int ret; u32 isn; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* * Count it even if it's bad. */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { sock_hold(sk); } refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. */ tcp_v6_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); } else { drop_reason = tcp_child_process(sk, nsk, skb); if (drop_reason) { enum sk_rst_reason rst_reason; rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); goto discard_and_relse; } sock_put(sk); return 0; } } process: if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); if (tcp_filter(sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v6_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret ? -1 : 0; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason)); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2; sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; __this_cpu_write(tcp_tw_isn, isn); goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: case TCP_TW_ACK_OOW: tcp_v6_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: ; } goto discard_it; } void tcp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; hdr = ipv6_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } } static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .mtu_reduced = tcp_v6_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v6_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, #endif }; #endif /* * TCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif return 0; } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; u8 icsk_pending; int rx_queue; int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl " "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" " uid timeout inode\n"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait6_sock(seq, v, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); else get_tcp6_sock(seq, v, st->num); out: return 0; } static const struct seq_operations tcp6_seq_ops = { .show = tcp6_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .family = AF_INET6, }; int __net_init tcp6_proc_init(struct net *net) { if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops, sizeof(struct tcp_iter_state), &tcp6_seq_afinfo)) return -ENOMEM; return 0; } void tcp6_proc_exit(struct net *net) { remove_proc_entry("tcp6", net->proc_net); } #endif struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; static int __net_init tcpv6_net_init(struct net *net) { int res; res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); if (!res) net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; return res; } static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, }; int __init tcpv6_init(void) { int ret; net_hotdata.tcpv6_protocol = (struct inet6_protocol) { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; /* register inet6 protocol */ ret = inet6_register_protosw(&tcpv6_protosw); if (ret) goto out_tcpv6_protocol; ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; ret = mptcpv6_init(); if (ret) goto out_tcpv6_pernet_subsys; out: return ret; out_tcpv6_pernet_subsys: unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP); } |
| 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | /* SPDX-License-Identifier: GPL-2.0 */ /* * devres.c - managed gpio resources * This file is based on kernel/irq/devres.c * * Copyright (c) 2011 John Crispin <john@phrozen.org> */ #include <linux/device/devres.h> #include <linux/err.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/types.h> #include <linux/gpio/consumer.h> #include "gpiolib.h" struct fwnode_handle; struct lock_class_key; static void devm_gpiod_release(void *desc) { gpiod_put(desc); } static void devm_gpiod_release_array(void *descs) { gpiod_put_array(descs); } /** * devm_gpiod_get - Resource-managed gpiod_get() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get() for detailed * information about behavior and return values. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check devm_gpiod_get(struct device *dev, const char *con_id, enum gpiod_flags flags) { return devm_gpiod_get_index(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(devm_gpiod_get); /** * devm_gpiod_get_optional - Resource-managed gpiod_get_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_optional(). GPIO descriptors returned from this function * are automatically disposed on driver detach. See gpiod_get_optional() for * detailed information about behavior and return values. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, NULL if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check devm_gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { return devm_gpiod_get_index_optional(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(devm_gpiod_get_optional); /** * devm_gpiod_get_index - Resource-managed gpiod_get_index() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @idx: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_index(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get_index() for detailed * information about behavior and return values. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev, const char *con_id, unsigned int idx, enum gpiod_flags flags) { struct gpio_desc *desc; int ret; desc = gpiod_get_index(dev, con_id, idx, flags); if (IS_ERR(desc)) return desc; /* * For non-exclusive GPIO descriptors, check if this descriptor is * already under resource management by this device. */ if (flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE) { bool dres; dres = devm_is_action_added(dev, devm_gpiod_release, desc); if (dres) return desc; } ret = devm_add_action_or_reset(dev, devm_gpiod_release, desc); if (ret) return ERR_PTR(ret); return desc; } EXPORT_SYMBOL_GPL(devm_gpiod_get_index); /** * devm_fwnode_gpiod_get_index - get a GPIO descriptor from a given node * @dev: GPIO consumer * @fwnode: firmware node containing GPIO reference * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: GPIO initialization flags * @label: label to attach to the requested GPIO * * GPIO descriptors returned from this function are automatically disposed on * driver detach. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, const char *label) { struct gpio_desc *desc; int ret; desc = gpiod_find_and_request(dev, fwnode, con_id, index, flags, label, false); if (IS_ERR(desc)) return desc; ret = devm_add_action_or_reset(dev, devm_gpiod_release, desc); if (ret) return ERR_PTR(ret); return desc; } EXPORT_SYMBOL_GPL(devm_fwnode_gpiod_get_index); /** * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_index_optional(). GPIO descriptors returned from this * function are automatically disposed on driver detach. See * gpiod_get_index_optional() for detailed information about behavior and * return values. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %NULL if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check devm_gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { struct gpio_desc *desc; desc = devm_gpiod_get_index(dev, con_id, index, flags); if (gpiod_not_found(desc)) return NULL; return desc; } EXPORT_SYMBOL_GPL(devm_gpiod_get_index_optional); /** * devm_gpiod_get_array - Resource-managed gpiod_get_array() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_array(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get_array() for detailed * information about behavior and return values. * * Returns: * The GPIO descriptors corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, * or another IS_ERR() code if an error occurred while trying to acquire * the GPIOs. */ struct gpio_descs *__must_check devm_gpiod_get_array(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; int ret; descs = gpiod_get_array(dev, con_id, flags); if (IS_ERR(descs)) return descs; ret = devm_add_action_or_reset(dev, devm_gpiod_release_array, descs); if (ret) return ERR_PTR(ret); return descs; } EXPORT_SYMBOL_GPL(devm_gpiod_get_array); /** * devm_gpiod_get_array_optional - Resource-managed gpiod_get_array_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_array_optional(). GPIO descriptors returned from this * function are automatically disposed on driver detach. * See gpiod_get_array_optional() for detailed information about behavior and * return values. * * Returns: * The GPIO descriptors corresponding to the function @con_id of device * dev, %NULL if no GPIO has been assigned to the requested function, * or another IS_ERR() code if an error occurred while trying to acquire * the GPIOs. */ struct gpio_descs *__must_check devm_gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; descs = devm_gpiod_get_array(dev, con_id, flags); if (gpiod_not_found(descs)) return NULL; return descs; } EXPORT_SYMBOL_GPL(devm_gpiod_get_array_optional); /** * devm_gpiod_put - Resource-managed gpiod_put() * @dev: GPIO consumer * @desc: GPIO descriptor to dispose of * * Dispose of a GPIO descriptor obtained with devm_gpiod_get() or * devm_gpiod_get_index(). Normally this function will not be called as the GPIO * will be disposed of by the resource management code. */ void devm_gpiod_put(struct device *dev, struct gpio_desc *desc) { devm_release_action(dev, devm_gpiod_release, desc); } EXPORT_SYMBOL_GPL(devm_gpiod_put); /** * devm_gpiod_unhinge - Remove resource management from a gpio descriptor * @dev: GPIO consumer * @desc: GPIO descriptor to remove resource management from * * *DEPRECATED* * This function should not be used. It's been provided as a workaround for * resource ownership issues in the regulator framework and should be replaced * with a better solution. * * Remove resource management from a GPIO descriptor. This is needed when * you want to hand over lifecycle management of a descriptor to another * mechanism. */ void devm_gpiod_unhinge(struct device *dev, struct gpio_desc *desc) { int ret; if (IS_ERR_OR_NULL(desc)) return; /* * If the GPIO descriptor is requested as nonexclusive, we * may call this function several times on the same descriptor * so it is OK if devres_destroy() returns -ENOENT. */ ret = devm_remove_action_nowarn(dev, devm_gpiod_release, desc); if (ret == -ENOENT) return; /* Anything else we should warn about */ WARN_ON(ret); } EXPORT_SYMBOL_GPL(devm_gpiod_unhinge); /** * devm_gpiod_put_array - Resource-managed gpiod_put_array() * @dev: GPIO consumer * @descs: GPIO descriptor array to dispose of * * Dispose of an array of GPIO descriptors obtained with devm_gpiod_get_array(). * Normally this function will not be called as the GPIOs will be disposed of * by the resource management code. */ void devm_gpiod_put_array(struct device *dev, struct gpio_descs *descs) { devm_remove_action(dev, devm_gpiod_release_array, descs); } EXPORT_SYMBOL_GPL(devm_gpiod_put_array); static void devm_gpio_chip_release(void *data) { struct gpio_chip *gc = data; gpiochip_remove(gc); } /** * devm_gpiochip_add_data_with_key() - Resource managed gpiochip_add_data_with_key() * @dev: pointer to the device that gpio_chip belongs to. * @gc: the GPIO chip to register * @data: driver-private data associated with this chip * @lock_key: lockdep class for IRQ lock * @request_key: lockdep class for IRQ request * * Context: potentially before irqs will work * * The gpio chip automatically be released when the device is unbound. * * Returns: * A negative errno if the chip can't be registered, such as because the * gc->base is invalid or already associated with a different chip. * Otherwise it returns zero as a success code. */ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, void *data, struct lock_class_key *lock_key, struct lock_class_key *request_key) { int ret; ret = gpiochip_add_data_with_key(gc, data, lock_key, request_key); if (ret < 0) return ret; return devm_add_action_or_reset(dev, devm_gpio_chip_release, gc); } EXPORT_SYMBOL_GPL(devm_gpiochip_add_data_with_key); |
| 316 316 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007, 2008, 2009 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> * * Automatically generate and verify integrity data on PI capable devices if the * bio submitter didn't provide PI itself. This ensures that kernel verifies * data integrity even if the file system (or other user of the block device) is * not aware of PI. */ #include <linux/blk-integrity.h> #include <linux/t10-pi.h> #include <linux/workqueue.h> #include "blk.h" struct bio_integrity_data { struct bio *bio; struct bvec_iter saved_bio_iter; struct work_struct work; struct bio_integrity_payload bip; struct bio_vec bvec; }; static struct kmem_cache *bid_slab; static mempool_t bid_pool; static struct workqueue_struct *kintegrityd_wq; static void bio_integrity_finish(struct bio_integrity_data *bid) { bid->bio->bi_integrity = NULL; bid->bio->bi_opf &= ~REQ_INTEGRITY; kfree(bvec_virt(bid->bip.bip_vec)); mempool_free(bid, &bid_pool); } static void bio_integrity_verify_fn(struct work_struct *work) { struct bio_integrity_data *bid = container_of(work, struct bio_integrity_data, work); struct bio *bio = bid->bio; blk_integrity_verify_iter(bio, &bid->saved_bio_iter); bio_integrity_finish(bid); bio_endio(bio); } #define BIP_CHECK_FLAGS (BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) static bool bip_should_check(struct bio_integrity_payload *bip) { return bip->bip_flags & BIP_CHECK_FLAGS; } static bool bi_offload_capable(struct blk_integrity *bi) { switch (bi->csum_type) { case BLK_INTEGRITY_CSUM_CRC64: return bi->tuple_size == sizeof(struct crc64_pi_tuple); case BLK_INTEGRITY_CSUM_CRC: case BLK_INTEGRITY_CSUM_IP: return bi->tuple_size == sizeof(struct t10_pi_tuple); default: pr_warn_once("%s: unknown integrity checksum type:%d\n", __func__, bi->csum_type); fallthrough; case BLK_INTEGRITY_CSUM_NONE: return false; } } /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio * * Normally I/O completion is done in interrupt context. However, verifying I/O * integrity is a time-consuming task which must be run in process context. * * This function postpones completion accordingly. */ bool __bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_data *bid = container_of(bip, struct bio_integrity_data, bip); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && bip_should_check(bip)) { INIT_WORK(&bid->work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bid->work); return false; } bio_integrity_finish(bid); return true; } /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * * Checks if the bio already has an integrity payload attached. If it does, the * payload has been generated by another kernel subsystem, and we just pass it * through. * Otherwise allocates integrity payload and for writes the integrity metadata * will be generated. For reads, the completion handler will verify the * metadata. */ bool bio_integrity_prep(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; bool set_flags = true; gfp_t gfp = GFP_NOIO; unsigned int len; void *buf; if (!bi) return true; if (!bio_sectors(bio)) return true; /* Already protected? */ if (bio_integrity(bio)) return true; switch (bio_op(bio)) { case REQ_OP_READ: if (bi->flags & BLK_INTEGRITY_NOVERIFY) { if (bi_offload_capable(bi)) return true; set_flags = false; } break; case REQ_OP_WRITE: /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk for non-integrity metadata where nothing else * initializes the memory. */ if (bi->flags & BLK_INTEGRITY_NOGENERATE) { if (bi_offload_capable(bi)) return true; set_flags = false; gfp |= __GFP_ZERO; } else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE) gfp |= __GFP_ZERO; break; default: return true; } if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return true; /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (!buf) goto err_end_io; bid = mempool_alloc(&bid_pool, GFP_NOIO); if (!bid) goto err_free_buf; bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); bid->bio = bio; bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (set_flags) { if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) bid->bip.bip_flags |= BIP_CHECK_GUARD; if (bi->flags & BLK_INTEGRITY_REF_TAG) bid->bip.bip_flags |= BIP_CHECK_REFTAG; } if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) goto err_end_io; /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; return true; err_free_buf: kfree(buf); err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; } EXPORT_SYMBOL(bio_integrity_prep); void blk_flush_integrity(void) { flush_workqueue(kintegrityd_wq); } static int __init blk_integrity_auto_init(void) { bid_slab = kmem_cache_create("bio_integrity_data", sizeof(struct bio_integrity_data), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); if (mempool_init_slab_pool(&bid_pool, BIO_POOL_SIZE, bid_slab)) panic("bio: can't create integrity pool\n"); /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. */ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); return 0; } subsys_initcall(blk_integrity_auto_init); |
| 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | // SPDX-License-Identifier: GPL-2.0-only /* * ebtable_filter * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2002 * */ #include <linux/netfilter_bridge/ebtables.h> #include <uapi/linux/netfilter_bridge.h> #include <linux/module.h> #define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ (1 << NF_BR_LOCAL_OUT)) static struct ebt_entries initial_chains[] = { { .name = "INPUT", .policy = EBT_ACCEPT, }, { .name = "FORWARD", .policy = EBT_ACCEPT, }, { .name = "OUTPUT", .policy = EBT_ACCEPT, }, }; static struct ebt_replace_kernel initial_table = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .entries_size = 3 * sizeof(struct ebt_entries), .hook_entry = { [NF_BR_LOCAL_IN] = &initial_chains[0], [NF_BR_FORWARD] = &initial_chains[1], [NF_BR_LOCAL_OUT] = &initial_chains[2], }, .entries = (char *)initial_chains, }; static const struct ebt_table frame_filter = { .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, }; static const struct nf_hook_ops ebt_ops_filter[] = { { .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FILTER_OTHER, }, }; static int frame_filter_table_init(struct net *net) { return ebt_register_table(net, &frame_filter, ebt_ops_filter); } static void __net_exit frame_filter_net_pre_exit(struct net *net) { ebt_unregister_table_pre_exit(net, "filter"); } static void __net_exit frame_filter_net_exit(struct net *net) { ebt_unregister_table(net, "filter"); } static struct pernet_operations frame_filter_net_ops = { .exit = frame_filter_net_exit, .pre_exit = frame_filter_net_pre_exit, }; static int __init ebtable_filter_init(void) { int ret = ebt_register_template(&frame_filter, frame_filter_table_init); if (ret) return ret; ret = register_pernet_subsys(&frame_filter_net_ops); if (ret) { ebt_unregister_template(&frame_filter); return ret; } return 0; } static void __exit ebtable_filter_fini(void) { unregister_pernet_subsys(&frame_filter_net_ops); ebt_unregister_template(&frame_filter); } module_init(ebtable_filter_init); module_exit(ebtable_filter_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ebtables legacy filter table"); |
| 13 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 | // SPDX-License-Identifier: GPL-2.0 /* Shared Memory Communications Direct over ISM devices (SMC-D) * * Functions for ISM device. * * Copyright IBM Corp. 2018 */ #include <linux/if_vlan.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/slab.h> #include <asm/page.h> #include "smc.h" #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; #if IS_ENABLED(CONFIG_ISM) static void smcd_register_dev(struct ism_dev *ism); static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .add = smcd_register_dev, .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif static void smc_ism_create_system_eid(void) { struct smc_ism_seid *seid = (struct smc_ism_seid *)smc_ism_v2_system_eid; #if IS_ENABLED(CONFIG_S390) struct cpuid id; u16 ident_tail; char tmp[5]; memcpy(seid->seid_string, "IBM-SYSZ-ISMSEID00000000", 24); get_cpu_id(&id); ident_tail = (u16)(id.ident & SMC_ISM_IDENT_MASK); snprintf(tmp, 5, "%04X", ident_tail); memcpy(seid->serial_number, tmp, 4); snprintf(tmp, 5, "%04X", id.machine); memcpy(seid->type, tmp, 4); #else memset(seid, 0, SMC_MAX_EID_LEN); #endif } /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, vlan_id); } void smc_ism_get_system_eid(u8 **eid) { if (!smc_ism_v2_capable) *eid = NULL; else *eid = smc_ism_v2_system_eid; } u16 smc_ism_get_chid(struct smcd_dev *smcd) { return smcd->ops->get_chid(smcd); } /* HW supports ISM V2 and thus System EID is defined */ bool smc_ism_is_v2_capable(void) { return smc_ism_v2_capable; } void smc_ism_set_v2_capable(void) { smc_ism_v2_capable = true; } /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { unsigned long flags; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Unset a connection using this DMBE. */ void smc_ism_unset_conn(struct smc_connection *conn) { unsigned long flags; if (!conn->rmb_desc) return; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Register a VLAN identifier with the ISM device. Use a reference count * and add a VLAN identifier only when the first DMB using this VLAN is * registered. */ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *new_vlan, *vlan; unsigned long flags; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; if (!smcd->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); if (!new_vlan) return -ENOMEM; new_vlan->vlanid = vlanid; refcount_set(&new_vlan->refcnt, 1); /* if there is an existing entry, increase count and return */ spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { refcount_inc(&vlan->refcnt); kfree(new_vlan); goto out; } } /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ if (smcd->ops->add_vlan_id(smcd, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; } list_add_tail(&new_vlan->list, &smcd->vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } /* Unregister a VLAN identifier with the ISM device. Use a reference count * and remove a VLAN identifier only when the last DMB using this VLAN is * unregistered. */ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *vlan; unsigned long flags; bool found = false; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; if (!smcd->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { if (!refcount_dec_and_test(&vlan->refcnt)) goto out; found = true; break; } } if (!found) { rc = -ENOENT; goto out; /* VLAN id not in table */ } /* Found and the last reference just gone */ if (smcd->ops->del_vlan_id(smcd, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc = 0; if (!dmb_desc->dma_addr) return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; dmb.sba_idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; rc = smcd->ops->unregister_dmb(smcd, &dmb); if (!rc || rc == ISM_ERROR) { dmb_desc->cpu_addr = NULL; dmb_desc->dma_addr = 0; } return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid.gid; rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; } return rc; } bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) { /* for now only loopback-ism supports * merging sndbuf with peer DMB to avoid * data copies between them. */ return (smcd->ops->support_dmb_nocopy && smcd->ops->support_dmb_nocopy(smcd)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc = 0; if (!dev->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; rc = dev->ops->attach_dmb(dev, &dmb); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; } return rc; } int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { if (!dev->ops->detach_dmb) return -EINVAL; return dev->ops->detach_dmb(dev, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; struct ism_dev *ism; int use_cnt = 0; void *nlh; ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); if (!attrs) goto errout; use_cnt = atomic_read(&smcd->lgr_cnt); if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) goto errattr; if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) goto errattr; port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); if (!port_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) goto errportattr; memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errportattr; nla_nest_end(skb, port_attrs); nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errportattr: nla_nest_cancel(skb, port_attrs); errattr: nla_nest_cancel(skb, attrs); errout: nlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); int snum = cb_ctx->pos[0]; struct smcd_dev *smcd; int num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; if (smc_ism_is_loopback(smcd)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; } int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } #if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 #define ISM_EVENT_RESPONSE 0x0002 #define ISM_EVENT_REQUEST_IR 0x00000001 #define ISM_EVENT_CODE_SHUTDOWN 0x80 #define ISM_EVENT_CODE_TESTLINK 0x83 union smcd_sw_event_info { u64 info; struct { u8 uid[SMC_LGR_ID_SIZE]; unsigned short vlan_id; u16 code; }; }; static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { struct smcd_gid peer_gid = { .gid = wrk->event.tok, .gid_ext = 0 }; union smcd_sw_event_info ev_info; ev_info.info = wrk->event.info; switch (wrk->event.code) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && wrk->smcd->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, &peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_TESTLINK, ev_info.info); } break; } } /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); struct smcd_gid smcd_gid = { .gid = wrk->event.tok, .gid_ext = 0 }; switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; case ISM_EVENT_SWR: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; smcd->conn = devm_kcalloc(parent, max_dmbs, sizeof(struct smc_connection *), GFP_KERNEL); if (!smcd->conn) return NULL; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) return NULL; smcd->ops = ops; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; if (!ops) return; smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, ISM_NR_DMBS); if (!smcd) return; smcd->priv = ism; smcd->client = &smc_ism_client; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); if (smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; * - loopback-ism always at the very beginning; */ if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); if (fentry && smc_ism_is_loopback(fentry)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); } else { list_add_tail(&smcd->list, &smcd_dev_list.list); } mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); return; } static void smcd_unregister_dev(struct ism_dev *ism) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&ism->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); } /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) * - event->time (time of day) * - event->info (debug info). * * Context: * - Function called in IRQ context from ISM device driver event handler. */ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) return; INIT_WORK(&wrk->work, smc_ism_event_work); wrk->smcd = smcd; wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } #endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; #if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; if (lgr->peer_shutdown) return 0; if (!lgr->smcd->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); #endif return rc; } int smc_ism_init(void) { int rc = 0; smc_ism_v2_capable = false; smc_ism_create_system_eid(); #if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif return rc; } void smc_ism_exit(void) { #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif } |
| 2 2 2 2 2 2 2 2 2 2 6 6 6 6 6 6 2 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | /* * linux/drivers/video/fb_sys_read.c - Generic file operations where * framebuffer is in system RAM * * Copyright (C) 2007 Antonino Daplas <adaplas@pol.net> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. * */ #include <linux/fb.h> #include <linux/module.h> #include <linux/uaccess.h> ssize_t fb_sys_read(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; void *src; int err = 0; unsigned long total_size, c; ssize_t ret; if (!(info->flags & FBINFO_VIRTFB)) fb_warn_once(info, "Framebuffer is not in virtual address space."); if (!info->screen_buffer) return -ENODEV; total_size = info->screen_size; if (total_size == 0) total_size = info->fix.smem_len; if (p >= total_size) return 0; if (count >= total_size) count = total_size; if (count + p > total_size) count = total_size - p; src = info->screen_buffer + p; if (info->fbops->fb_sync) info->fbops->fb_sync(info); c = copy_to_user(buf, src, count); if (c) err = -EFAULT; ret = count - c; *ppos += ret; return ret ? ret : err; } EXPORT_SYMBOL_GPL(fb_sys_read); ssize_t fb_sys_write(struct fb_info *info, const char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; void *dst; int err = 0; unsigned long total_size, c; size_t ret; if (!(info->flags & FBINFO_VIRTFB)) fb_warn_once(info, "Framebuffer is not in virtual address space."); if (!info->screen_buffer) return -ENODEV; total_size = info->screen_size; if (total_size == 0) total_size = info->fix.smem_len; if (p > total_size) return -EFBIG; if (count > total_size) { err = -EFBIG; count = total_size; } if (count + p > total_size) { if (!err) err = -ENOSPC; count = total_size - p; } dst = info->screen_buffer + p; if (info->fbops->fb_sync) info->fbops->fb_sync(info); c = copy_from_user(dst, buf, count); if (c) err = -EFAULT; ret = count - c; *ppos += ret; return ret ? ret : err; } EXPORT_SYMBOL_GPL(fb_sys_write); MODULE_AUTHOR("Antonino Daplas <adaplas@pol.net>"); MODULE_DESCRIPTION("Generic file read (fb in system RAM)"); MODULE_LICENSE("GPL"); |
| 9 38 12 12 12 12 12 12 19 15 10 10 10 3 3 20 10 6 10 4 1 1 10 13 2 2 11 11 3 11 6 11 11 15 5 12 12 22 19 21 15 14 14 11 3 2 2 3 1 2 15 22 9 9 6 8 9 6 9 3 9 9 8 8 9 4 9 2 3 5 5 5 3 5 9 41 40 40 37 9 9 9 9 9 5 12 11 11 40 9 3 9 1 7 7 7 1 6 6 25 24 23 21 1 21 17 21 9 19 19 9 19 19 19 10 10 10 10 10 7 9 3 3 10 13 7 8 8 19 21 16 25 25 27 25 20 17 3 1 14 12 11 12 2 4 2 15 9 8 2 1 1 1 3 26 25 23 4 3 1 20 22 22 17 2 17 15 15 7 15 9 15 8 8 7 7 7 7 9 9 9 8 9 9 4 6 6 2 2 14 4 4 11 25 26 30 30 30 30 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 | // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/msg.c * Copyright (C) 1992 Krishna Balasubramanian * * Removed all the remaining kerneld mess * Catch the -EFAULT stuff properly * Use GFP_KERNEL for messages as in 1.2 * Fixed up the unchecked user space derefs * Copyright (C) 1998 Alan Cox & Andi Kleen * * /proc/sysvipc/msg support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * * mostly rewritten, threaded and wake-one semantics added * MSGMAX limit removed, sysctl's added * (c) 1999 Manfred Spraul <manfred@colorfullife.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> */ #include <linux/capability.h> #include <linux/msg.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/list.h> #include <linux/security.h> #include <linux/sched/wake_q.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/percpu_counter.h> #include <asm/current.h> #include <linux/uaccess.h> #include "util.h" /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; time64_t q_stime; /* last msgsnd time */ time64_t q_rtime; /* last msgrcv time */ time64_t q_ctime; /* last change time */ unsigned long q_cbytes; /* current number of bytes on queue */ unsigned long q_qnum; /* number of messages in queue */ unsigned long q_qbytes; /* max number of bytes on queue */ struct pid *q_lspid; /* pid of last msgsnd */ struct pid *q_lrpid; /* last receive pid */ struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; } __randomize_layout; /* * MSG_BARRIER Locking: * * Similar to the optimization used in ipc/mqueue.c, one syscall return path * does not acquire any locks when it sees that a message exists in * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release() * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition, * wake_q_add_safe() is used. See ipc/mqueue.c for more details */ /* one msg_receiver structure for each sleeping receiver */ struct msg_receiver { struct list_head r_list; struct task_struct *r_tsk; int r_mode; long r_msgtype; long r_maxsize; struct msg_msg *r_msg; }; /* one msg_sender for each sleeping sender */ struct msg_sender { struct list_head list; struct task_struct *tsk; size_t msgsz; }; #define SEARCH_ANY 1 #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 #define SEARCH_LESSEQUAL 4 #define SEARCH_NUMBER 5 #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&msg_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct msg_queue, q_perm); } static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct msg_queue, q_perm); } static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) { ipc_rmid(&msg_ids(ns), &s->q_perm); } static void msg_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(&msq->q_perm); kfree(msq); } /** * newque - Create a new msg queue * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { struct msg_queue *msq; int retval; key_t key = params->key; int msgflg = params->flg; msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; msq->q_perm.mode = msgflg & S_IRWXUGO; msq->q_perm.key = key; msq->q_perm.security = NULL; retval = security_msg_queue_alloc(&msq->q_perm); if (retval) { kfree(msq); return retval; } msq->q_stime = msq->q_rtime = 0; msq->q_ctime = ktime_get_real_seconds(); msq->q_cbytes = msq->q_qnum = 0; msq->q_qbytes = ns->msg_ctlmnb; msq->q_lspid = msq->q_lrpid = NULL; INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); /* ipc_addid() locks msq upon success. */ retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (retval < 0) { ipc_rcu_putref(&msq->q_perm, msg_rcu_free); return retval; } ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); return msq->q_perm.id; } static inline bool msg_fits_inqueue(struct msg_queue *msq, size_t msgsz) { return msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes; } static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss, size_t msgsz) { mss->tsk = current; mss->msgsz = msgsz; /* * No memory barrier required: we did ipc_lock_object(), * and the waker obtains that lock before calling wake_q_add(). */ __set_current_state(TASK_INTERRUPTIBLE); list_add_tail(&mss->list, &msq->q_senders); } static inline void ss_del(struct msg_sender *mss) { if (mss->list.next) list_del(&mss->list); } static void ss_wakeup(struct msg_queue *msq, struct wake_q_head *wake_q, bool kill) { struct msg_sender *mss, *t; struct task_struct *stop_tsk = NULL; struct list_head *h = &msq->q_senders; list_for_each_entry_safe(mss, t, h, list) { if (kill) mss->list.next = NULL; /* * Stop at the first task we don't wakeup, * we've already iterated the original * sender queue. */ else if (stop_tsk == mss->tsk) break; /* * We are not in an EIDRM scenario here, therefore * verify that we really need to wakeup the task. * To maintain current semantics and wakeup order, * move the sender to the tail on behalf of the * blocked task. */ else if (!msg_fits_inqueue(msq, mss->msgsz)) { if (!stop_tsk) stop_tsk = mss->tsk; list_move_tail(&mss->list, &msq->q_senders); continue; } wake_q_add(wake_q, mss->tsk); } } static void expunge_all(struct msg_queue *msq, int res, struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { struct task_struct *r_tsk; r_tsk = get_task_struct(msr->r_tsk); /* see MSG_BARRIER for purpose/pairing */ smp_store_release(&msr->r_msg, ERR_PTR(res)); wake_q_add_safe(wake_q, r_tsk); } } /* * freeque() wakes up waiters on the sender and receiver waiting queue, * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * * msg_ids.rwsem (writer) and the spinlock for this message queue are held * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) __releases(RCU) __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); DEFINE_WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(msq, &wake_q, true); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); } long ksys_msgget(key_t key, int msgflg) { struct ipc_namespace *ns; static const struct ipc_ops msg_ops = { .getnew = newque, .associate = security_msg_queue_associate, }; struct ipc_params msg_params; ns = current->nsproxy->ipc_ns; msg_params.key = key; msg_params.flg = msgflg; return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) { return ksys_msgget(key, msgflg); } static inline unsigned long copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct msqid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->msg_perm, &out.msg_perm); out.msg_stime = in->msg_stime; out.msg_rtime = in->msg_rtime; out.msg_ctime = in->msg_ctime; if (in->msg_cbytes > USHRT_MAX) out.msg_cbytes = USHRT_MAX; else out.msg_cbytes = in->msg_cbytes; out.msg_lcbytes = in->msg_cbytes; if (in->msg_qnum > USHRT_MAX) out.msg_qnum = USHRT_MAX; else out.msg_qnum = in->msg_qnum; if (in->msg_qbytes > USHRT_MAX) out.msg_qbytes = USHRT_MAX; else out.msg_qbytes = in->msg_qbytes; out.msg_lqbytes = in->msg_qbytes; out.msg_lspid = in->msg_lspid; out.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static inline unsigned long copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct msqid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->msg_perm.uid = tbuf_old.msg_perm.uid; out->msg_perm.gid = tbuf_old.msg_perm.gid; out->msg_perm.mode = tbuf_old.msg_perm.mode; if (tbuf_old.msg_qbytes == 0) out->msg_qbytes = tbuf_old.msg_lqbytes; else out->msg_qbytes = tbuf_old.msg_qbytes; return 0; } default: return -EINVAL; } } /* * This function handles some msgctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct ipc64_perm *perm, int msg_qbytes) { struct kern_ipc_perm *ipcp; struct msg_queue *msq; int err; down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd, perm, msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } msq = container_of(ipcp, struct msg_queue, q_perm); err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: ipc_lock_object(&msq->q_perm); /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; case IPC_SET: { DEFINE_WAKE_Q(wake_q); if (msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; goto out_unlock1; } ipc_lock_object(&msq->q_perm); err = ipc_update_perm(perm, ipcp); if (err) goto out_unlock0; msq->q_qbytes = msg_qbytes; msq->q_ctime = ktime_get_real_seconds(); /* * Sleeping receivers might be excluded by * stricter permissions. */ expunge_all(msq, -EAGAIN, &wake_q); /* * Sleeping senders might be able to send * due to a larger queue size. */ ss_wakeup(msq, &wake_q, false); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); goto out_unlock1; } default: err = -EINVAL; goto out_unlock1; } out_unlock0: ipc_unlock_object(&msq->q_perm); out_unlock1: rcu_read_unlock(); out_up: up_write(&msg_ids(ns).rwsem); return err; } static int msgctl_info(struct ipc_namespace *ns, int msqid, int cmd, struct msginfo *msginfo) { int err; int max_idx; /* * We must not return kernel stack data. * due to padding, it's not enough * to set all member fields. */ err = security_msg_queue_msgctl(NULL, cmd); if (err) return err; memset(msginfo, 0, sizeof(*msginfo)); msginfo->msgmni = ns->msg_ctlmni; msginfo->msgmax = ns->msg_ctlmax; msginfo->msgmnb = ns->msg_ctlmnb; msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo->msgmap = min_t(int, percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); msginfo->msgtql = min_t(int, percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } return (max_idx < 0) ? 0 : max_idx; } static int msgctl_stat(struct ipc_namespace *ns, int msqid, int cmd, struct msqid64_ds *p) { struct msg_queue *msq; int err; memset(p, 0, sizeof(*p)); rcu_read_lock(); if (cmd == MSG_STAT || cmd == MSG_STAT_ANY) { msq = msq_obtain_object(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } } /* see comment for SHM_STAT_ANY */ if (cmd == MSG_STAT_ANY) audit_ipc_obj(&msq->q_perm); else { err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; } err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&msq->q_perm); if (!ipc_valid_object(&msq->q_perm)) { ipc_unlock_object(&msq->q_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&msq->q_perm, &p->msg_perm); p->msg_stime = msq->q_stime; p->msg_rtime = msq->q_rtime; p->msg_ctime = msq->q_ctime; #ifndef CONFIG_64BIT p->msg_stime_high = msq->q_stime >> 32; p->msg_rtime_high = msq->q_rtime >> 32; p->msg_ctime_high = msq->q_ctime >> 32; #endif p->msg_cbytes = msq->q_cbytes; p->msg_qnum = msq->q_qnum; p->msg_qbytes = msq->q_qbytes; p->msg_lspid = pid_vnr(msq->q_lspid); p->msg_lrpid = pid_vnr(msq->q_lrpid); if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * MSG_STAT and MSG_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = msq->q_perm.id; } ipc_unlock_object(&msq->q_perm); out_unlock: rcu_read_unlock(); return err; } static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int version) { struct ipc_namespace *ns; struct msqid64_ds msqid64; int err; if (msqid < 0 || cmd < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; err = msgctl_info(ns, msqid, cmd, &msginfo); if (err < 0) return err; if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) err = -EFAULT; return err; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ case MSG_STAT_ANY: case IPC_STAT: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; if (copy_msqid_to_user(buf, &msqid64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_msqid_from_user(&msqid64, buf, version)) return -EFAULT; return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } } SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { return ksys_msgctl(msqid, cmd, buf, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) { int version = ipc_parse_version(&cmd); return ksys_msgctl(msqid, cmd, buf, version); } SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { return ksys_old_msgctl(msqid, cmd, buf); } #endif #ifdef CONFIG_COMPAT struct compat_msqid_ds { struct compat_ipc_perm msg_perm; compat_uptr_t msg_first; compat_uptr_t msg_last; old_time32_t msg_stime; old_time32_t msg_rtime; old_time32_t msg_ctime; compat_ulong_t msg_lcbytes; compat_ulong_t msg_lqbytes; unsigned short msg_cbytes; unsigned short msg_qnum; unsigned short msg_qbytes; compat_ipc_pid_t msg_lspid; compat_ipc_pid_t msg_lrpid; }; static int copy_compat_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_msqid64_ds __user *p = buf; if (get_compat_ipc64_perm(&out->msg_perm, &p->msg_perm)) return -EFAULT; if (get_user(out->msg_qbytes, &p->msg_qbytes)) return -EFAULT; } else { struct compat_msqid_ds __user *p = buf; if (get_compat_ipc_perm(&out->msg_perm, &p->msg_perm)) return -EFAULT; if (get_user(out->msg_qbytes, &p->msg_qbytes)) return -EFAULT; } return 0; } static int copy_compat_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version) { if (version == IPC_64) { struct compat_msqid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.msg_perm, &in->msg_perm); v.msg_stime = lower_32_bits(in->msg_stime); v.msg_stime_high = upper_32_bits(in->msg_stime); v.msg_rtime = lower_32_bits(in->msg_rtime); v.msg_rtime_high = upper_32_bits(in->msg_rtime); v.msg_ctime = lower_32_bits(in->msg_ctime); v.msg_ctime_high = upper_32_bits(in->msg_ctime); v.msg_cbytes = in->msg_cbytes; v.msg_qnum = in->msg_qnum; v.msg_qbytes = in->msg_qbytes; v.msg_lspid = in->msg_lspid; v.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_msqid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.msg_perm, &in->msg_perm); v.msg_stime = in->msg_stime; v.msg_rtime = in->msg_rtime; v.msg_ctime = in->msg_ctime; v.msg_cbytes = in->msg_cbytes; v.msg_qnum = in->msg_qnum; v.msg_qbytes = in->msg_qbytes; v.msg_lspid = in->msg_lspid; v.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &v, sizeof(v)); } } static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; int err; struct msqid64_ds msqid64; ns = current->nsproxy->ipc_ns; if (msqid < 0 || cmd < 0) return -EINVAL; switch (cmd & (~IPC_64)) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; err = msgctl_info(ns, msqid, cmd, &msginfo); if (err < 0) return err; if (copy_to_user(uptr, &msginfo, sizeof(struct msginfo))) err = -EFAULT; return err; } case IPC_STAT: case MSG_STAT: case MSG_STAT_ANY: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; if (copy_compat_msqid_to_user(uptr, &msqid64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_compat_msqid_from_user(&msqid64, uptr, version)) return -EFAULT; return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } } COMPAT_SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, void __user *, uptr) { return compat_ksys_msgctl(msqid, cmd, uptr, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_msgctl(msqid, cmd, uptr, version); } COMPAT_SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, void __user *, uptr) { return compat_ksys_old_msgctl(msqid, cmd, uptr); } #endif #endif static int testmsg(struct msg_msg *msg, long type, int mode) { switch (mode) { case SEARCH_ANY: case SEARCH_NUMBER: return 1; case SEARCH_LESSEQUAL: if (msg->m_type <= type) return 1; break; case SEARCH_EQUAL: if (msg->m_type == type) return 1; break; case SEARCH_NOTEQUAL: if (msg->m_type != type) return 1; break; } return 0; } static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { if (testmsg(msg, msr->r_msgtype, msr->r_mode) && !security_msg_queue_msgrcv(&msq->q_perm, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { wake_q_add(wake_q, msr->r_tsk); /* See expunge_all regarding memory barrier */ smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG)); } else { ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk)); msq->q_rtime = ktime_get_real_seconds(); wake_q_add(wake_q, msr->r_tsk); /* See expunge_all regarding memory barrier */ smp_store_release(&msr->r_msg, msg); return 1; } } } return 0; } static long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg) { struct msg_queue *msq; struct msg_msg *msg; int err; struct ipc_namespace *ns; DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0) return -EINVAL; if (mtype < 1) return -EINVAL; msg = load_msg(mtext, msgsz); if (IS_ERR(msg)) return PTR_ERR(msg); msg->m_type = mtype; msg->m_ts = msgsz; rcu_read_lock(); msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock1; } ipc_lock_object(&msq->q_perm); for (;;) { struct msg_sender s; err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) goto out_unlock0; /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } err = security_msg_queue_msgsnd(&msq->q_perm, msg, msgflg); if (err) goto out_unlock0; if (msg_fits_inqueue(msq, msgsz)) break; /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; goto out_unlock0; } /* enqueue the sender and prepare to block */ ss_add(msq, &s, msgsz); if (!ipc_rcu_getref(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); schedule(); rcu_read_lock(); ipc_lock_object(&msq->q_perm); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } ss_del(&s); if (signal_pending(current)) { err = -ERESTARTNOHAND; goto out_unlock0; } } ipc_update_pid(&msq->q_lspid, task_tgid(current)); msq->q_stime = ktime_get_real_seconds(); if (!pipelined_send(msq, msg, &wake_q)) { /* no one is waiting for this message, enqueue it */ list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; msg = NULL; out_unlock0: ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (msg != NULL) free_msg(msg); return err; } long ksys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg) { long mtype; if (get_user(mtype, &msgp->mtype)) return -EFAULT; return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); } SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, int, msgflg) { return ksys_msgsnd(msqid, msgp, msgsz, msgflg); } #ifdef CONFIG_COMPAT struct compat_msgbuf { compat_long_t mtype; char mtext[]; }; long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg) { struct compat_msgbuf __user *up = compat_ptr(msgp); compat_long_t mtype; if (get_user(mtype, &up->mtype)) return -EFAULT; return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg); } COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, compat_ssize_t, msgsz, int, msgflg) { return compat_ksys_msgsnd(msqid, msgp, msgsz, msgflg); } #endif static inline int convert_mode(long *msgtyp, int msgflg) { if (msgflg & MSG_COPY) return SEARCH_NUMBER; /* * find message of correct type. * msgtyp = 0 => get first. * msgtyp > 0 => get first message of matching type. * msgtyp < 0 => get message with least type must be < abs(msgtype). */ if (*msgtyp == 0) return SEARCH_ANY; if (*msgtyp < 0) { if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ *msgtyp = LONG_MAX; else *msgtyp = -*msgtyp; return SEARCH_LESSEQUAL; } if (msgflg & MSG_EXCEPT) return SEARCH_NOTEQUAL; return SEARCH_EQUAL; } static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct msgbuf __user *msgp = dest; size_t msgsz; if (put_user(msg->m_type, &msgp->mtype)) return -EFAULT; msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; if (store_msg(msgp->mtext, msg, msgsz)) return -EFAULT; return msgsz; } #ifdef CONFIG_CHECKPOINT_RESTORE /* * This function creates new kernel message structure, large enough to store * bufsz message bytes. */ static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { struct msg_msg *copy; /* * Create dummy message to copy real message to. */ copy = load_msg(buf, bufsz); if (!IS_ERR(copy)) copy->m_ts = bufsz; return copy; } static inline void free_copy(struct msg_msg *copy) { if (copy) free_msg(copy); } #else static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { return ERR_PTR(-ENOSYS); } static inline void free_copy(struct msg_msg *copy) { } #endif static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) { struct msg_msg *msg, *found = NULL; long count = 0; list_for_each_entry(msg, &msq->q_messages, m_list) { if (testmsg(msg, *msgtyp, mode) && !security_msg_queue_msgrcv(&msq->q_perm, msg, current, *msgtyp, mode)) { if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) { *msgtyp = msg->m_type - 1; found = msg; } else if (mode == SEARCH_NUMBER) { if (*msgtyp == count) return msg; } else return msg; count++; } } return found ?: ERR_PTR(-EAGAIN); } static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { int mode; struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; if (msgflg & MSG_COPY) { if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT)) return -EINVAL; copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) return PTR_ERR(copy); } mode = convert_mode(&msgtyp, msgflg); rcu_read_lock(); msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { rcu_read_unlock(); free_copy(copy); return PTR_ERR(msq); } for (;;) { struct msg_receiver msr_d; msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock1; ipc_lock_object(&msq->q_perm); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { msg = ERR_PTR(-EIDRM); goto out_unlock0; } msg = find_msg(msq, &msgtyp, mode); if (!IS_ERR(msg)) { /* * Found a suitable message. * Unlink it from the queue. */ if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); goto out_unlock0; } /* * If we are copying, then do not unlink message and do * not update queue parameters. */ if (msgflg & MSG_COPY) { msg = copy_msg(msg, copy); goto out_unlock0; } list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; } /* No message waiting. Wait for a message */ if (msgflg & IPC_NOWAIT) { msg = ERR_PTR(-ENOMSG); goto out_unlock0; } list_add_tail(&msr_d.r_list, &msq->q_receivers); msr_d.r_tsk = current; msr_d.r_msgtype = msgtyp; msr_d.r_mode = mode; if (msgflg & MSG_NOERROR) msr_d.r_maxsize = INT_MAX; else msr_d.r_maxsize = bufsz; /* memory barrier not require due to ipc_lock_object() */ WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN)); /* memory barrier not required, we own ipc_lock_object() */ __set_current_state(TASK_INTERRUPTIBLE); ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); schedule(); /* * Lockless receive, part 1: * We don't hold a reference to the queue and getting a * reference would defeat the idea of a lockless operation, * thus the code relies on rcu to guarantee the existence of * msq: * Prior to destruction, expunge_all(-EIRDM) changes r_msg. * Thus if r_msg is -EAGAIN, then the queue not yet destroyed. */ rcu_read_lock(); /* * Lockless receive, part 2: * The work in pipelined_send() and expunge_all(): * - Set pointer to message * - Queue the receiver task for later wakeup * - Wake up the process after the lock is dropped. * * Should the process wake up before this wakeup (due to a * signal) it will either see the message and continue ... */ msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) { /* see MSG_BARRIER for purpose/pairing */ smp_acquire__after_ctrl_dep(); goto out_unlock1; } /* * ... or see -EAGAIN, acquire the lock to check the message * again. */ ipc_lock_object(&msq->q_perm); msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) goto out_unlock0; list_del(&msr_d.r_list); if (signal_pending(current)) { msg = ERR_PTR(-ERESTARTNOHAND); goto out_unlock0; } ipc_unlock_object(&msq->q_perm); } out_unlock0: ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (IS_ERR(msg)) { free_copy(copy); return PTR_ERR(msg); } bufsz = msg_handler(buf, msg, bufsz); free_msg(msg); return bufsz; } long ksys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, long msgtyp, int msgflg) { return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, long, msgtyp, int, msgflg) { return ksys_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg); } #ifdef CONFIG_COMPAT static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct compat_msgbuf __user *msgp = dest; size_t msgsz; if (put_user(msg->m_type, &msgp->mtype)) return -EFAULT; msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; if (store_msg(msgp->mtext, msg, msgsz)) return -EFAULT; return msgsz; } long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg) { return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp, msgflg, compat_do_msg_fill); } COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg) { return compat_ksys_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg); } #endif int msg_init_ns(struct ipc_namespace *ns) { int ret; ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); if (ret) goto fail_msg_bytes; ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); if (ret) goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); return 0; fail_msg_hdrs: percpu_counter_destroy(&ns->percpu_msg_bytes); fail_msg_bytes: return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); percpu_counter_destroy(&ns->percpu_msg_bytes); percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it) { struct pid_namespace *pid_ns = ipc_seq_pid_ns(s); struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); seq_printf(s, "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n", msq->q_perm.key, msq->q_perm.id, msq->q_perm.mode, msq->q_cbytes, msq->q_qnum, pid_nr_ns(msq->q_lspid, pid_ns), pid_nr_ns(msq->q_lrpid, pid_ns), from_kuid_munged(user_ns, msq->q_perm.uid), from_kgid_munged(user_ns, msq->q_perm.gid), from_kuid_munged(user_ns, msq->q_perm.cuid), from_kgid_munged(user_ns, msq->q_perm.cgid), msq->q_stime, msq->q_rtime, msq->q_ctime); return 0; } #endif void __init msg_init(void) { msg_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); } |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Red Hat. All rights reserved. */ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/math64.h> #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> #include <linux/string_choices.h> #include "extent-tree.h" #include "fs.h" #include "messages.h" #include "misc.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" #include "extent_io.h" #include "space-info.h" #include "block-group.h" #include "discard.h" #include "subpage.h" #include "inode-item.h" #include "accessors.h" #include "file-item.h" #include "file.h" #include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M static struct kmem_cache *btrfs_free_space_cachep; static struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_trim_range { u64 start; u64 bytes; struct list_head list; }; static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat); static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc); static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info); static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); static void btrfs_crc32c_final(u32 crc, u8 *result) { put_unaligned_le32(~crc, result); } static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *node; while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!info->bitmap) { unlink_free_space(ctl, info, true); kmem_cache_free(btrfs_free_space_cachep, info); } else { free_bitmap(ctl, info); } cond_resched_lock(&ctl->tree_lock); } } static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) { struct btrfs_key key; struct btrfs_key location; struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return ERR_PTR(ret); if (ret > 0) { btrfs_release_path(path); return ERR_PTR(-ENOENT); } leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_free_space_key(leaf, header, &disk_key); btrfs_disk_key_to_cpu(&location, &disk_key); btrfs_release_path(path); /* * We are often under a trans handle at this point, so we need to make * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); inode = btrfs_iget_path(location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) return ERR_CAST(inode); mapping_set_gfp_mask(inode->vfs_inode.i_mapping, mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) inode = igrab(&block_group->inode->vfs_inode); spin_unlock(&block_group->lock); if (inode) return inode; inode = __lookup_free_space_inode(fs_info->tree_root, path, block_group->start); if (IS_ERR(inode)) return inode; spin_lock(&block_group->lock); if (!((BTRFS_I(inode)->flags & flags) == flags)) { btrfs_info(fs_info, "Old style space inode found, converting."); BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) block_group->inode = BTRFS_I(igrab(inode)); spin_unlock(&block_group->lock); return inode; } static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 ino, u64 offset) { struct btrfs_key key; struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; /* We inline CRCs for the free disk space cache */ const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); btrfs_item_key(leaf, &disk_key, path->slots[0]); memzero_extent_buffer(leaf, (unsigned long)inode_item, sizeof(*inode_item)); btrfs_set_inode_generation(leaf, inode_item, trans->transid); btrfs_set_inode_size(leaf, inode_item, 0); btrfs_set_inode_nbytes(leaf, inode_item, 0); btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { btrfs_release_path(path); return ret; } leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); btrfs_release_path(path); return 0; } int create_free_space_inode(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { int ret; u64 ino; ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; return __create_free_space_inode(trans->fs_info->tree_root, trans, path, ino, block_group->start); } /* * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode * handles lookup, otherwise it takes ownership and iputs the inode. * Don't reuse an inode pointer after passing it into this function. */ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (!inode) inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } clear_nlink(inode); /* One for the block groups ref */ spin_lock(&block_group->lock); if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { block_group->inode = NULL; spin_unlock(&block_group->lock); iput(inode); } else { spin_unlock(&block_group->lock); } /* One for the lookup ref */ btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; key.offset = block_group->start; ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, -1, 1); if (ret) { if (ret > 0) ret = 0; return ret; } return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *vfs_inode) { struct btrfs_truncate_control control = { .inode = BTRFS_I(vfs_inode), .new_size = 0, .ino = btrfs_ino(BTRFS_I(vfs_inode)), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; int ret = 0; bool locked = false; if (block_group) { BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; } locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); btrfs_wait_cache_io(trans, block_group, path); btrfs_put_block_group(block_group); } /* * now that we've truncated the cache away, its no longer * setup or written */ spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); } btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* * We skip the throttling logic for free space cache inodes, so we don't * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, &control); inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; ret = btrfs_update_inode(trans, inode); fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, ret); return ret; } static void readahead_cache(struct inode *inode) { struct file_ra_state ra; unsigned long last_index; file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, int write) { int num_pages; num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* Make sure we can fit our crcs and generation into the first page */ if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) return -ENOSPC; memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; io_ctl->num_pages = num_pages; io_ctl->fs_info = inode_to_fs_info(inode); io_ctl->inode = inode; return 0; } ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { kfree(io_ctl->pages); io_ctl->pages = NULL; } static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) { if (io_ctl->cur) { io_ctl->cur = NULL; io_ctl->orig = NULL; } } static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) { ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) { int i; io_ctl_unmap_page(io_ctl); for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { btrfs_folio_clear_checked(io_ctl->fs_info, page_folio(io_ctl->pages[i]), page_offset(io_ctl->pages[i]), PAGE_SIZE); unlock_page(io_ctl->pages[i]); put_page(io_ctl->pages[i]); } } } static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; for (i = 0; i < io_ctl->num_pages; i++) { int ret; folio = __filemap_get_folio(inode->i_mapping, i, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); return PTR_ERR(folio); } ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } io_ctl->pages[i] = &folio->page; if (uptodate && !folio_test_uptodate(folio)) { btrfs_read_folio(NULL, folio); folio_lock(folio); if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); return -EIO; } } } for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); return 0; } static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { io_ctl_map_page(io_ctl, 1); /* * Skip the csum areas. If we don't check crcs then we just have a * 64bit chunk at the front of the first page. */ io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit * chunk at the front of the first page. */ io_ctl->cur += sizeof(u32) * io_ctl->num_pages; io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); cache_gen = get_unaligned_le64(io_ctl->cur); if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } io_ctl->cur += sizeof(u64); return 0; } static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp; u32 crc = ~(u32)0; unsigned offset = 0; if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; *tmp = crc; } static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp, val; u32 crc = ~(u32)0; unsigned offset = 0; if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; tmp = page_address(io_ctl->pages[0]); tmp += index; val = *tmp; io_ctl_map_page(io_ctl, 0); crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; } return 0; } static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, void *bitmap) { struct btrfs_free_space_entry *entry; if (!io_ctl->cur) return -ENOSPC; entry = io_ctl->cur; put_unaligned_le64(offset, &entry->offset); put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; io_ctl_set_crc(io_ctl, io_ctl->index - 1); /* No more pages to map */ if (io_ctl->index >= io_ctl->num_pages) return 0; /* map the next page */ io_ctl_map_page(io_ctl, 1); return 0; } static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) { if (!io_ctl->cur) return -ENOSPC; /* * If we aren't at the start of the current page, unmap this one and * map the next one if there is any left. */ if (io_ctl->cur != io_ctl->orig) { io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index >= io_ctl->num_pages) return -ENOSPC; io_ctl_map_page(io_ctl, 0); } copy_page(io_ctl->cur, bitmap); io_ctl_set_crc(io_ctl, io_ctl->index - 1); if (io_ctl->index < io_ctl->num_pages) io_ctl_map_page(io_ctl, 0); return 0; } static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) { /* * If we're not on the boundary we know we've modified the page and we * need to crc the page. */ if (io_ctl->cur != io_ctl->orig) io_ctl_set_crc(io_ctl, io_ctl->index - 1); else io_ctl_unmap_page(io_ctl); while (io_ctl->index < io_ctl->num_pages) { io_ctl_map_page(io_ctl, 1); io_ctl_set_crc(io_ctl, io_ctl->index - 1); } } static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; int ret; if (!io_ctl->cur) { ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; } e = io_ctl->cur; entry->offset = get_unaligned_le64(&e->offset); entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) return 0; io_ctl_unmap_page(io_ctl); return 0; } static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry) { int ret; ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; copy_page(entry->bitmap, io_ctl->cur); io_ctl_unmap_page(io_ctl); return 0; } static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) { struct btrfs_block_group *block_group = ctl->block_group; u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->length; u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); max_bitmaps = max_t(u64, max_bitmaps, 1); if (ctl->total_bitmaps > max_bitmaps) btrfs_err(block_group->fs_info, "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", block_group->start, block_group->length, ctl->total_bitmaps, ctl->unit, max_bitmaps, bytes_per_bg); ASSERT(ctl->total_bitmaps <= max_bitmaps); /* * We are trying to keep the total amount of memory used per 1GiB of * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of * bitmaps, we may end up using more memory than this. */ if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); bitmap_bytes = ctl->total_bitmaps * ctl->unit; /* * we want the extent entry threshold to always be at most 1/2 the max * bytes we can have, or whatever is less than that. */ extent_bytes = max_bytes - bitmap_bytes; extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); ctl->extents_thresh = div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct btrfs_io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; LIST_HEAD(bitmaps); u64 num_entries; u64 num_bitmaps; u64 generation; u8 type; int ret = 0; /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) return 0; else if (ret > 0) { btrfs_release_path(path); return 0; } ret = -1; leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); num_entries = btrfs_free_space_entries(leaf, header); num_bitmaps = btrfs_free_space_bitmaps(leaf, header); generation = btrfs_free_space_generation(leaf, header); btrfs_release_path(path); if (!BTRFS_I(inode)->generation) { btrfs_info(fs_info, "the free space cache file (%llu) is invalid, skip it", offset); return 0; } if (BTRFS_I(inode)->generation != generation) { btrfs_err(fs_info, "free space inode generation (%llu) did not match free space cache generation (%llu)", BTRFS_I(inode)->generation, generation); return 0; } if (!num_entries) return 0; ret = io_ctl_init(&io_ctl, inode, 0); if (ret) return ret; readahead_cache(inode); ret = io_ctl_prepare_pages(&io_ctl, true); if (ret) goto out; ret = io_ctl_check_crc(&io_ctl, 0); if (ret) goto free_cache; ret = io_ctl_check_generation(&io_ctl, generation); if (ret) goto free_cache; while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!e) { ret = -ENOMEM; goto free_cache; } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } if (!e->bytes) { ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } if (type == BTRFS_FREE_SPACE_EXTENT) { spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); spin_unlock(&ctl->tree_lock); if (ret) { btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } } else { ASSERT(num_bitmaps); num_bitmaps--; e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; } spin_lock(&ctl->tree_lock); ret = link_free_space(ctl, e); if (ret) { spin_unlock(&ctl->tree_lock); btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } ctl->total_bitmaps++; recalculate_thresholds(ctl); spin_unlock(&ctl->tree_lock); list_add_tail(&e->list, &bitmaps); } num_entries--; } io_ctl_unmap_page(&io_ctl); /* * We add the bitmaps at the end of the entries in order that * the bitmap entries are added to the cache. */ list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; } io_ctl_drop_pages(&io_ctl); ret = 1; out: io_ctl_free(&io_ctl); return ret; free_cache: io_ctl_drop_pages(&io_ctl); spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache(ctl); spin_unlock(&ctl->tree_lock); goto out; } static int copy_free_space_cache(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *n; int ret = 0; while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { const u64 offset = info->offset; const u64 bytes = info->bytes; unlink_free_space(ctl, info, true); spin_unlock(&ctl->tree_lock); kmem_cache_free(btrfs_free_space_cachep, info); ret = btrfs_add_free_space(block_group, offset, bytes); spin_lock(&ctl->tree_lock); } else { u64 offset = info->offset; u64 bytes = ctl->unit; ret = search_bitmap(ctl, info, &offset, &bytes, false); if (ret == 0) { bitmap_clear_bits(ctl, info, offset, bytes, true); spin_unlock(&ctl->tree_lock); ret = btrfs_add_free_space(block_group, offset, bytes); spin_lock(&ctl->tree_lock); } else { free_bitmap(ctl, info); ret = 0; } } cond_resched_lock(&ctl->tree_lock); } return ret; } static struct lock_class_key btrfs_free_space_inode_key; int load_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space_ctl tmp_ctl = {}; struct inode *inode; struct btrfs_path *path; int ret = 0; bool matched; u64 used = block_group->used; /* * Because we could potentially discard our loaded free space, we want * to load everything into a temporary structure first, and then if it's * valid copy it all into the actual free space ctl. */ btrfs_init_free_space_ctl(block_group, &tmp_ctl); /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. */ spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); return 0; } spin_unlock(&block_group->lock); path = btrfs_alloc_path(); if (!path) return 0; path->search_commit_root = 1; path->skip_locking = 1; /* * We must pass a path with search_commit_root set to btrfs_iget in * order to avoid a deadlock when allocating extents for the tree root. * * When we are COWing an extent buffer from the tree root, when looking * for a free extent, at extent-tree.c:find_free_extent(), we can find * block group without its free space cache loaded. When we find one * we must load its space cache which requires reading its free space * cache's inode item from the root tree. If this inode item is located * in the same leaf that we started COWing before, then we end up in * deadlock on the extent buffer (trying to read lock it when we * previously write locked it). * * It's safe to read the inode item using the commit root because * block groups, once loaded, stay in memory forever (until they are * removed) as well as their space caches once loaded. New block groups * once created get their ->cached field set to BTRFS_CACHE_FINISHED so * we will never try to read their inode item while the fs is mounted. */ inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) { btrfs_free_path(path); return 0; } /* We may have converted the inode and made the cache invalid. */ spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); btrfs_free_path(path); goto out; } spin_unlock(&block_group->lock); /* * Reinitialize the class of struct inode's mapping->invalidate_lock for * free space inodes to prevent false positives related to locks for normal * inodes. */ lockdep_set_class(&(&inode->i_data)->invalidate_lock, &btrfs_free_space_inode_key); ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, path, block_group->start); btrfs_free_path(path); if (ret <= 0) goto out; matched = (tmp_ctl.free_space == (block_group->length - used - block_group->bytes_super)); if (matched) { spin_lock(&tmp_ctl.tree_lock); ret = copy_free_space_cache(block_group, &tmp_ctl); spin_unlock(&tmp_ctl.tree_lock); /* * ret == 1 means we successfully loaded the free space cache, * so we need to re-set it here. */ if (ret == 0) ret = 1; } else { /* * We need to call the _locked variant so we don't try to update * the discard counters. */ spin_lock(&tmp_ctl.tree_lock); __btrfs_remove_free_space_cache(&tmp_ctl); spin_unlock(&tmp_ctl.tree_lock); btrfs_warn(fs_info, "block group %llu has wrong amount of free space", block_group->start); ret = -1; } out: if (ret < 0) { /* This cache is bogus, make sure it gets cleared */ spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); ret = 0; btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now", block_group->start); } spin_lock(&ctl->tree_lock); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); iput(inode); return ret; } static noinline_for_stack int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, int *entries, int *bitmaps, struct list_head *bitmap_list) { int ret; struct btrfs_free_cluster *cluster = NULL; struct btrfs_free_cluster *cluster_locked = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { cluster = list_first_entry(&block_group->cluster_list, struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { cluster_locked = cluster; spin_lock(&cluster_locked->lock); node = rb_first(&cluster->root); cluster = NULL; } /* Write out the extent entries */ while (node) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); *entries += 1; ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, e->bitmap); if (ret) goto fail; if (e->bitmap) { list_add_tail(&e->list, bitmap_list); *bitmaps += 1; } node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root); cluster_locked = cluster; spin_lock(&cluster_locked->lock); cluster = NULL; } } if (cluster_locked) { spin_unlock(&cluster_locked->lock); cluster_locked = NULL; } /* * Make sure we don't miss any range that was removed from our rbtree * because trimming is running. Otherwise after a umount+mount (or crash * after committing the transaction) we would leak free space and get * an inconsistent free space cache report from fsck. */ list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { ret = io_ctl_add_entry(io_ctl, trim_entry->start, trim_entry->bytes, NULL); if (ret) goto fail; *entries += 1; } return 0; fail: if (cluster_locked) spin_unlock(&cluster_locked->lock); return -ENOSPC; } static noinline_for_stack int update_cache_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 offset, int entries, int bitmaps) { struct btrfs_key key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; if (ret > 0) { struct btrfs_key found_key; ASSERT(path->slots[0]); path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); btrfs_release_path(path); goto fail; } } BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); btrfs_release_path(path); return 0; fail: return -1; } static noinline_for_stack int write_pinned_extent_entries( struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, int *entries) { u64 start, extent_start, extent_end, len; struct extent_io_tree *unpin = NULL; int ret; if (!block_group) return 0; /* * We want to add any pinned extents to our free space cache * so we don't leak the space * * We shouldn't have switched the pinned extents yet so this is the * right one */ unpin = &trans->transaction->pinned_extents; start = block_group->start; while (start < block_group->start + block_group->length) { if (!btrfs_find_first_extent_bit(unpin, start, &extent_start, &extent_end, EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ if (extent_start >= block_group->start + block_group->length) return 0; extent_start = max(extent_start, start); extent_end = min(block_group->start + block_group->length, extent_end + 1); len = extent_end - extent_start; *entries += 1; ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); if (ret) return -ENOSPC; start = extent_end; } return 0; } static noinline_for_stack int write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { struct btrfs_free_space *entry, *next; int ret; /* Write out the bitmaps */ list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) return -ENOSPC; list_del_init(&entry->list); } return 0; } static int flush_dirty_cache(struct inode *inode) { int ret; ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, EXTENT_DELALLOC, NULL); return ret; } static void noinline_for_stack cleanup_bitmap_list(struct list_head *bitmap_list) { struct btrfs_free_space *entry, *next; list_for_each_entry_safe(entry, next, bitmap_list, list) list_del_init(&entry->list); } static void noinline_for_stack cleanup_write_cache_enospc(struct inode *inode, struct btrfs_io_ctl *io_ctl, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_path *path, u64 offset) { int ret; struct inode *inode = io_ctl->inode; if (!inode) return 0; /* Flush the dirty pages in the cache file. */ ret = flush_dirty_cache(inode); if (ret) goto out; /* Update the cache item to tell everyone this cache file is valid. */ ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; if (block_group) btrfs_debug(root->fs_info, "failed to write free space cache for block group %llu error %d", block_group->start, ret); } btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ spin_lock(&trans->transaction->dirty_bgs_lock); /* the disk_cache_state is protected by the block group lock */ spin_lock(&block_group->lock); /* * only mark this as written if we didn't get put back on * the dirty list while waiting for IO. Otherwise our * cache state won't be right, and we won't get written again */ if (!ret && list_empty(&block_group->dirty_list)) block_group->disk_cache_state = BTRFS_DC_WRITTEN; else if (ret) block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); spin_unlock(&trans->transaction->dirty_bgs_lock); io_ctl->inode = NULL; iput(inode); } return ret; } int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, block_group, &block_group->io_ctl, path, block_group->start); } /* * Write out cached info to an inode. * * @inode: freespace inode we are writing out * @ctl: free space cache we are going to write out * @block_group: block_group for this cache if it belongs to a block_group * @io_ctl: holds context for the io * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans) { struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; int bitmaps = 0; int ret; int must_iput = 0; int i_size; if (!i_size_read(inode)) return -EIO; WARN_ON(io_ctl->pages); ret = io_ctl_init(io_ctl, inode, 1); if (ret) return ret; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem); spin_lock(&block_group->lock); if (block_group->delalloc_bytes) { block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); up_write(&block_group->data_rwsem); BTRFS_I(inode)->generation = 0; ret = 0; must_iput = 1; goto out; } spin_unlock(&block_group->lock); } /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, false); if (ret) goto out_unlock; btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); io_ctl_set_generation(io_ctl, trans->transid); mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ spin_lock(&ctl->tree_lock); ret = write_cache_extent_entries(io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); if (ret) goto out_nospc_locked; /* * Some spaces that are freed in the current transaction are pinned, * they will be added into free space cache after the transaction is * committed, we shouldn't lose them. * * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); if (ret) goto out_nospc_locked; /* * At last, we write out all the bitmaps and keep cache_writeout_mutex * locked while doing it because a concurrent trim can be manipulating * or freeing the bitmap. */ ret = write_bitmap_entries(io_ctl, &bitmap_list); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; /* Zero out the rest of the pages just to make sure */ io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ i_size = i_size_read(inode); for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { u64 dirty_start = i * PAGE_SIZE; u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), dirty_start, dirty_len, &cached_state, false); if (ret < 0) goto out_nospc; } if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); /* * Release the pages and unlock the extent, we will flush * them out later */ io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); /* * at this point the pages are under IO and we're happy, * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; io_ctl->bitmaps = bitmaps; ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); if (ret) goto out; return 0; out_nospc_locked: cleanup_bitmap_list(&bitmap_list); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); out: io_ctl->inode = NULL; io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; } int btrfs_write_out_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; spin_lock(&block_group->lock); if (block_group->disk_cache_state < BTRFS_DC_SETUP) { spin_unlock(&block_group->lock); return 0; } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode)) return 0; ret = __btrfs_write_out_cache(inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", block_group->start, ret); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); block_group->io_ctl.inode = NULL; iput(inode); } /* * if ret == 0 the caller is expected to call btrfs_wait_cache_io * to wait for IO and put the inode */ return ret; } static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, u64 offset) { ASSERT(offset >= bitmap_start); offset -= bitmap_start; return (unsigned long)(div_u64(offset, unit)); } static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) { return (unsigned long)(div_u64(bytes, unit)); } static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; return bitmap_start; } static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, struct btrfs_free_cluster *cluster, struct btrfs_free_space *new_entry) { struct rb_root *root; struct rb_node **p; struct rb_node *parent = NULL; lockdep_assert_held(&ctl->tree_lock); if (cluster) { lockdep_assert_held(&cluster->lock); root = &cluster->root; } else { root = &ctl->free_space_offset; } p = &root->rb_node; while (*p) { struct btrfs_free_space *info; parent = *p; info = rb_entry(parent, struct btrfs_free_space, offset_index); if (new_entry->offset < info->offset) { p = &(*p)->rb_left; } else if (new_entry->offset > info->offset) { p = &(*p)->rb_right; } else { /* * we could have a bitmap entry and an extent entry * share the same offset. If this is the case, we want * the extent entry to always be found first if we do a * linear search through the tree, since we want to have * the quickest allocation time, and allocating from an * extent is faster than allocating from a bitmap. So * if we're inserting a bitmap and we find an entry at * this offset, we want to go right, or after this entry * logically. If we are inserting an extent and we've * found a bitmap, we want to go left, or before * logically. */ if (new_entry->bitmap) { if (info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; } p = &(*p)->rb_right; } else { if (!info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; } p = &(*p)->rb_left; } } } rb_link_node(&new_entry->offset_index, parent, p); rb_insert_color(&new_entry->offset_index, root); return 0; } /* * This is a little subtle. We *only* have ->max_extent_size set if we actually * searched through the bitmap and figured out the largest ->max_extent_size, * otherwise it's 0. In the case that it's 0 we don't want to tell the * allocator the wrong thing, we want to use the actual real max_extent_size * we've found already if it's larger, or we want to use ->bytes. * * This matters because find_free_space() will skip entries who's ->bytes is * less than the required bytes. So if we didn't search down this bitmap, we * may pick some previous entry that has a smaller ->max_extent_size than we * have. For example, assume we have two entries, one that has * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will * call into find_free_space(), and return with max_extent_size == 4K, because * that first bitmap entry had ->max_extent_size set, but the second one did * not. If instead we returned 8K we'd come in searching for 8K, and find the * 8K contiguous range. * * Consider the other case, we have 2 8K chunks in that second entry and still * don't have ->max_extent_size set. We'll return 16K, and the next time the * allocator comes in it'll fully search our second bitmap, and this time it'll * get an uptodate value of 8K as the maximum chunk size. Then we'll get the * right allocation the next loop through. */ static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) { if (entry->bitmap && entry->max_extent_size) return entry->max_extent_size; return entry->bytes; } /* * We want the largest entry to be leftmost, so this is inverted from what you'd * normally expect. */ static bool entry_less(struct rb_node *node, const struct rb_node *parent) { const struct btrfs_free_space *entry, *exist; entry = rb_entry(node, struct btrfs_free_space, bytes_index); exist = rb_entry(parent, struct btrfs_free_space, bytes_index); return get_max_extent_size(exist) < get_max_extent_size(entry); } /* * searches the tree for the given offset. * * fuzzy - If this is set, then we are trying to make an allocation, and we just * want a section that has at least bytes size and comes at or after the given * offset. */ static struct btrfs_free_space * tree_search_offset(struct btrfs_free_space_ctl *ctl, u64 offset, int bitmap_only, int fuzzy) { struct rb_node *n = ctl->free_space_offset.rb_node; struct btrfs_free_space *entry = NULL, *prev = NULL; lockdep_assert_held(&ctl->tree_lock); /* find entry that is closest to the 'offset' */ while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); prev = entry; if (offset < entry->offset) n = n->rb_left; else if (offset > entry->offset) n = n->rb_right; else break; entry = NULL; } if (bitmap_only) { if (!entry) return NULL; if (entry->bitmap) return entry; /* * bitmap entry and extent entry may share same offset, * in that case, bitmap entry comes after extent entry. */ n = rb_next(n); if (!n) return NULL; entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->offset != offset) return NULL; WARN_ON(!entry->bitmap); return entry; } else if (entry) { if (entry->bitmap) { /* * if previous extent entry covers the offset, * we should return it instead of the bitmap entry */ n = rb_prev(&entry->offset_index); if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); if (!prev->bitmap && prev->offset + prev->bytes > offset) entry = prev; } } return entry; } if (!prev) return NULL; /* find last entry before the 'offset' */ entry = prev; if (entry->offset > offset) { n = rb_prev(&entry->offset_index); if (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); ASSERT(entry->offset <= offset); } else { if (fuzzy) return entry; else return NULL; } } if (entry->bitmap) { n = rb_prev(&entry->offset_index); if (n) { prev = rb_entry(n, struct btrfs_free_space, offset_index); if (!prev->bitmap && prev->offset + prev->bytes > offset) return prev; } if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) return entry; } else if (entry->offset + entry->bytes > offset) return entry; if (!fuzzy) return NULL; while (1) { n = rb_next(&entry->offset_index); if (!n) return NULL; entry = rb_entry(n, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) break; } else { if (entry->offset + entry->bytes > offset) break; } } return entry; } static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { lockdep_assert_held(&ctl->tree_lock); rb_erase(&info->offset_index, &ctl->free_space_offset); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; } if (update_stat) ctl->free_space -= info->bytes; } static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { int ret = 0; lockdep_assert_held(&ctl->tree_lock); ASSERT(info->bytes || info->bitmap); ret = tree_insert_offset(ctl, NULL, info); if (ret) return ret; rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); if (!info->bitmap && !btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; } ctl->free_space += info->bytes; ctl->free_extents++; return ret; } static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { ASSERT(info->bitmap); /* * If our entry is empty it's because we're on a cluster and we don't * want to re-link it into our ctl bytes index. */ if (RB_EMPTY_NODE(&info->bytes_index)) return; lockdep_assert_held(&ctl->tree_lock); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); } static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stat) { unsigned long start, count, end; int extent_delta = -1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); end = start + count; ASSERT(end <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; relink_bitmap_entry(ctl, info); if (start && test_bit(start - 1, info->bitmap)) extent_delta++; if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) extent_delta++; info->bitmap_extents += extent_delta; if (!btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; } if (update_stat) ctl->free_space -= bytes; } static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { unsigned long start, count, end; int extent_delta = 1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); end = start + count; ASSERT(end <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); /* * We set some bytes, we have no idea what the max extent size is * anymore. */ info->max_extent_size = 0; info->bytes += bytes; ctl->free_space += bytes; relink_bitmap_entry(ctl, info); if (start && test_bit(start - 1, info->bitmap)) extent_delta--; if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) extent_delta--; info->bitmap_extents += extent_delta; if (!btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; } } /* * If we can not find suitable extent, we will use bytes to record * the size of the max extent. */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes, bool for_alloc) { unsigned long found_bits = 0; unsigned long max_bits = 0; unsigned long bits, i; unsigned long next_zero; unsigned long extent_bits; /* * Skip searching the bitmap if we don't have a contiguous section that * is large enough for this allocation. */ if (for_alloc && bitmap_info->max_extent_size && bitmap_info->max_extent_size < *bytes) { *bytes = bitmap_info->max_extent_size; return -1; } i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { if (for_alloc && bits == 1) { found_bits = 1; break; } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; if (extent_bits >= bits) { found_bits = extent_bits; break; } else if (extent_bits > max_bits) { max_bits = extent_bits; } i = next_zero; } if (found_bits) { *offset = (u64)(i * ctl->unit) + bitmap_info->offset; *bytes = (u64)(found_bits) * ctl->unit; return 0; } *bytes = (u64)(max_bits) * ctl->unit; bitmap_info->max_extent_size = *bytes; relink_bitmap_entry(ctl, bitmap_info); return -1; } /* Cache the size of the max extent in bytes */ static struct btrfs_free_space * find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, unsigned long align, u64 *max_extent_size, bool use_bytes_index) { struct btrfs_free_space *entry; struct rb_node *node; u64 tmp; u64 align_off; int ret; if (!ctl->free_space_offset.rb_node) goto out; again: if (use_bytes_index) { node = rb_first_cached(&ctl->free_space_bytes); } else { entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); if (!entry) goto out; node = &entry->offset_index; } for (; node; node = rb_next(node)) { if (use_bytes_index) entry = rb_entry(node, struct btrfs_free_space, bytes_index); else entry = rb_entry(node, struct btrfs_free_space, offset_index); /* * If we are using the bytes index then all subsequent entries * in this tree are going to be < bytes, so simply set the max * extent size and exit the loop. * * If we're using the offset index then we need to keep going * through the rest of the tree. */ if (entry->bytes < *bytes) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); if (use_bytes_index) break; continue; } /* make sure the space returned is big enough * to match our requested alignment */ if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1; tmp = div64_u64(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; } else { align_off = 0; tmp = entry->offset; } /* * We don't break here if we're using the bytes index because we * may have another entry that has the correct alignment that is * the right size, so we don't want to miss that possibility. * At worst this adds another loop through the logic, but if we * broke here we could prematurely ENOSPC. */ if (entry->bytes < *bytes + align_off) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); continue; } if (entry->bitmap) { struct rb_node *old_next = rb_next(node); u64 size = *bytes; ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; return entry; } else { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); } /* * The bitmap may have gotten re-arranged in the space * index here because the max_extent_size may have been * updated. Start from the beginning again if this * happened. */ if (use_bytes_index && old_next != rb_next(node)) goto again; continue; } *offset = tmp; *bytes = entry->bytes - align_off; return entry; } out: return NULL; } static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; info->bitmap_extents = 0; INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; recalculate_thresholds(ctl); } static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { /* * Normally when this is called, the bitmap is completely empty. However, * if we are blowing up the free space cache for one reason or another * via __btrfs_remove_free_space_cache(), then it may not be freed and * we may leave stats on the table. */ if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { ctl->discardable_extents[BTRFS_STAT_CURR] -= bitmap_info->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; } unlink_free_space(ctl, bitmap_info, true); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); ctl->total_bitmaps--; recalculate_thresholds(ctl); } static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) { u64 end; u64 search_start, search_bytes; int ret; again: end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; /* * We need to search for bits in this bitmap. We could only cover some * of the extent in this bitmap thanks to how we add space, so we need * to search for as much as it as we can and clear that amount, and then * go searching for the next bit. */ search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, false); if (ret < 0 || search_start != *offset) return -EINVAL; /* We may have found more bits than what we need */ search_bytes = min(search_bytes, *bytes); /* Cannot clear past the end of the bitmap */ search_bytes = min(search_bytes, end - search_start + 1); bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); *offset += search_bytes; *bytes -= search_bytes; if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); if (!bitmap_info->bytes) free_bitmap(ctl, bitmap_info); /* * no entry after this bitmap, but we still have bytes to * remove, so something has gone wrong. */ if (!next) return -EINVAL; bitmap_info = rb_entry(next, struct btrfs_free_space, offset_index); /* * if the next entry isn't a bitmap we need to return to let the * extent stuff do its work. */ if (!bitmap_info->bitmap) return -EAGAIN; /* * Ok the next item is a bitmap, but it may not actually hold * the information for the rest of this free space stuff, so * look for it, and if we don't find it return so we can try * everything over again. */ search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; goto again; } else if (!bitmap_info->bytes) free_bitmap(ctl, bitmap_info); return 0; } static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { u64 bytes_to_set = 0; u64 end; /* * This is a tradeoff to make bitmap trim state minimal. We mark the * whole bitmap untrimmed if at any point we add untrimmed regions. */ if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { if (btrfs_free_space_trimmed(info)) { ctl->discardable_extents[BTRFS_STAT_CURR] += info->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; } info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; } end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); bytes_to_set = min(end - offset, bytes); btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); return bytes_to_set; } static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group *block_group = ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; #ifdef CONFIG_BTRFS_DEBUG if (btrfs_should_fragment_free_space(block_group)) forced = true; #endif /* This is a way to reclaim large regions from the bitmaps. */ if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) return false; /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ if (info->bytes <= fs_info->sectorsize * 8) { if (ctl->free_extents * 3 <= ctl->extents_thresh) return false; } else { return false; } } /* * The original block groups from mkfs can be really small, like 8 * megabytes, so don't bother with a bitmap for those entries. However * some block groups can be smaller than what a bitmap would cover but * are still large enough that they could overflow the 32k memory limit, * so allow those block groups to still be allowed to have a bitmap * entry. */ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) return false; return true; } static const struct btrfs_free_space_op free_space_op = { .use_bitmap = use_bitmap, }; static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; enum btrfs_trim_state trim_state; int ret; bytes = info->bytes; offset = info->offset; trim_state = info->trim_state; if (!ctl->op->use_bitmap(ctl, info)) return 0; if (ctl->op == &free_space_op) block_group = ctl->block_group; again: /* * Since we link bitmaps right into the cluster we need to see if we * have a cluster here, and if so and it has our bitmap we need to add * the free space to that bitmap. */ if (block_group && !list_empty(&block_group->cluster_list)) { struct btrfs_free_cluster *cluster; struct rb_node *node; struct btrfs_free_space *entry; cluster = list_first_entry(&block_group->cluster_list, struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { spin_unlock(&cluster->lock); goto no_cluster_bitmap; } entry = rb_entry(node, struct btrfs_free_space, offset_index); if (!entry->bitmap) { spin_unlock(&cluster->lock); goto no_cluster_bitmap; } if (entry->offset == offset_to_bitmap(ctl, offset)) { bytes_added = add_bytes_to_bitmap(ctl, entry, offset, bytes, trim_state); bytes -= bytes_added; offset += bytes_added; } spin_unlock(&cluster->lock); if (!bytes) { ret = 1; goto out; } } no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { ASSERT(added == 0); goto new_bitmap; } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, trim_state); bytes -= bytes_added; offset += bytes_added; added = 0; if (!bytes) { ret = 1; goto out; } else goto again; new_bitmap: if (info && info->bitmap) { add_new_bitmap(ctl, info, offset); added = 1; info = NULL; goto again; } else { spin_unlock(&ctl->tree_lock); /* no pre-allocated info, allocate a new one */ if (!info) { info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) { spin_lock(&ctl->tree_lock); ret = -ENOMEM; goto out; } } /* allocate the bitmap */ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); info->trim_state = BTRFS_TRIM_STATE_TRIMMED; spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; goto out; } goto again; } out: if (info) { if (info->bitmap) kmem_cache_free(btrfs_free_space_bitmap_cachep, info->bitmap); kmem_cache_free(btrfs_free_space_cachep, info); } return ret; } /* * Free space merging rules: * 1) Merge trimmed areas together * 2) Let untrimmed areas coalesce with trimmed areas * 3) Always pull neighboring regions from bitmaps * * The above rules are for when we merge free space based on btrfs_trim_state. * Rules 2 and 3 are subtle because they are suboptimal, but are done for the * same reason: to promote larger extent regions which makes life easier for * find_free_extent(). Rule 2 enables coalescing based on the common path * being returning free space from btrfs_finish_extent_commit(). So when free * space is trimmed, it will prevent aggregating trimmed new region and * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents * and provide find_free_extent() with the largest extents possible hoping for * the reuse path. */ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *right_info; bool merged = false; u64 offset = info->offset; u64 bytes = info->bytes; const bool is_trimmed = btrfs_free_space_trimmed(info); struct rb_node *right_prev = NULL; /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(ctl, offset + bytes, 0, 0); if (right_info) right_prev = rb_prev(&right_info->offset_index); if (right_prev) left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); /* See try_merge_free_space() comment. */ if (right_info && !right_info->bitmap && (!is_trimmed || btrfs_free_space_trimmed(right_info))) { unlink_free_space(ctl, right_info, update_stat); info->bytes += right_info->bytes; kmem_cache_free(btrfs_free_space_cachep, right_info); merged = true; } /* See try_merge_free_space() comment. */ if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset && (!is_trimmed || btrfs_free_space_trimmed(left_info))) { unlink_free_space(ctl, left_info, update_stat); info->offset = left_info->offset; info->bytes += left_info->bytes; kmem_cache_free(btrfs_free_space_cachep, left_info); merged = true; } return merged; } static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *bitmap; unsigned long i; unsigned long j; const u64 end = info->offset + info->bytes; const u64 bitmap_offset = offset_to_bitmap(ctl, end); u64 bytes; bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); if (!bitmap) return false; i = offset_to_bit(bitmap->offset, ctl->unit, end); j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); if (j == i) return false; bytes = (j - i) * ctl->unit; info->bytes += bytes; /* See try_merge_free_space() comment. */ if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); return true; } static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { struct btrfs_free_space *bitmap; u64 bitmap_offset; unsigned long i; unsigned long j; unsigned long prev_j; u64 bytes; bitmap_offset = offset_to_bitmap(ctl, info->offset); /* If we're on a boundary, try the previous logical bitmap. */ if (bitmap_offset == info->offset) { if (info->offset == 0) return false; bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); } bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); if (!bitmap) return false; i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; j = 0; prev_j = (unsigned long)-1; for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { if (j > i) break; prev_j = j; } if (prev_j == i) return false; if (prev_j == (unsigned long)-1) bytes = (i + 1) * ctl->unit; else bytes = (i - prev_j) * ctl->unit; info->offset -= bytes; info->bytes += bytes; /* See try_merge_free_space() comment. */ if (!btrfs_free_space_trimmed(bitmap)) info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); if (!bitmap->bytes) free_bitmap(ctl, bitmap); return true; } /* * We prefer always to allocate from extent entries, both for clustered and * non-clustered allocation requests. So when attempting to add a new extent * entry, try to see if there's adjacent free space in bitmap entries, and if * there is, migrate that space from the bitmaps to the extent. * Like this we get better chances of satisfying space allocation requests * because we attempt to satisfy them based on a single cache entry, and never * on 2 or more entries - even if the entries represent a contiguous free space * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry * ends). */ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { /* * Only work with disconnected entries, as we can change their offset, * and must be extent entries. */ ASSERT(!info->bitmap); ASSERT(RB_EMPTY_NODE(&info->offset_index)); if (ctl->total_bitmaps > 0) { bool stole_end; bool stole_front = false; stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); if (ctl->total_bitmaps > 0) stole_front = steal_from_bitmap_to_front(ctl, info, update_stat); if (stole_end || stole_front) try_merge_free_space(ctl, info, update_stat); } } static int __btrfs_add_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes, enum btrfs_trim_state trim_state) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret = 0; u64 filter_bytes = bytes; ASSERT(!btrfs_is_zoned(fs_info)); info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; info->offset = offset; info->bytes = bytes; info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); RB_CLEAR_NODE(&info->bytes_index); spin_lock(&ctl->tree_lock); if (try_merge_free_space(ctl, info, true)) goto link; /* * There was no extent directly to the left or right of this new * extent then we know we're going to have to allocate a new extent, so * before we do that see if we need to drop this into a bitmap */ ret = insert_into_bitmap(ctl, info); if (ret < 0) { goto out; } else if (ret) { ret = 0; goto out; } link: /* * Only steal free space from adjacent bitmaps if we're sure we're not * going to add the new free space to existing bitmap entries - because * that would mean unnecessary work that would be reverted. Therefore * attempt to steal space from bitmaps if we're adding an extent entry. */ steal_from_bitmap(ctl, info, true); filter_bytes = max(filter_bytes, info->bytes); ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (ret) { btrfs_crit(fs_info, "unable to add free space :%d", ret); ASSERT(ret != -EEXIST); } if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { btrfs_discard_check_filter(block_group, filter_bytes); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); } return ret; } static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { struct btrfs_space_info *sinfo = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; bool initial; u64 reclaimable_unusable; spin_lock(&block_group->lock); initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); WARN_ON(!initial && offset + size > block_group->zone_capacity); if (!initial) bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); if (!used) to_free = size; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) to_free = size; else if (offset + size <= block_group->alloc_offset) to_free = 0; else to_free = offset + size - block_group->alloc_offset; to_unusable = size - to_free; spin_lock(&ctl->tree_lock); ctl->free_space += to_free; spin_unlock(&ctl->tree_lock); /* * If the block group is read-only, we should account freed space into * bytes_readonly. */ if (!block_group->ro) { block_group->zone_unusable += to_unusable; WARN_ON(block_group->zone_unusable > block_group->length); } if (!used) { block_group->alloc_offset -= size; } reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } spin_unlock(&block_group->lock); return 0; } int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, true); if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, false); return btrfs_add_free_space(block_group, bytenr, size); } /* * This is a subtle distinction because when adding free space back in general, * we want it to be added as untrimmed for async. But in the case where we add * it on loading of a block group, we want to consider it trimmed. */ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; if (btrfs_is_zoned(block_group->fs_info)) return __btrfs_add_free_space_zoned(block_group, bytenr, size, true); if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; return __btrfs_add_free_space(block_group, bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; int ret; bool re_search = false; if (btrfs_is_zoned(block_group->fs_info)) { /* * This can happen with conventional zones when replaying log. * Since the allocation info of tree-log nodes are not recorded * to the extent-tree, calculate_alloc_pointer() failed to * advance the allocation pointer after last allocated tree log * node blocks. * * This function is called from * btrfs_pin_extent_for_log_replay() when replaying the log. * Advance the pointer not to overwrite the tree-log nodes. */ if (block_group->start + block_group->alloc_offset < offset + bytes) { block_group->alloc_offset = offset + bytes - block_group->start; } return 0; } spin_lock(&ctl->tree_lock); again: ret = 0; if (!bytes) goto out_lock; info = tree_search_offset(ctl, offset, 0, 0); if (!info) { /* * oops didn't find an extent that matched the space we wanted * to remove, look for a bitmap instead */ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { /* * If we found a partial bit of our free space in a * bitmap but then couldn't find the other part this may * be a problem, so WARN about it. */ WARN_ON(re_search); goto out_lock; } } re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info, true); if (offset == info->offset) { u64 to_free = min(bytes, info->bytes); info->bytes -= to_free; info->offset += to_free; if (info->bytes) { ret = link_free_space(ctl, info); WARN_ON(ret); } else { kmem_cache_free(btrfs_free_space_cachep, info); } offset += to_free; bytes -= to_free; goto again; } else { u64 old_end = info->bytes + info->offset; info->bytes = offset - info->offset; ret = link_free_space(ctl, info); WARN_ON(ret); if (ret) goto out_lock; /* Not enough bytes in this entry to satisfy us */ if (old_end < offset + bytes) { bytes -= old_end - offset; offset = old_end; goto again; } else if (old_end == offset + bytes) { /* all done */ goto out_lock; } spin_unlock(&ctl->tree_lock); ret = __btrfs_add_free_space(block_group, offset + bytes, old_end - (offset + bytes), info->trim_state); WARN_ON(ret); goto out; } } ret = remove_from_bitmap(ctl, info, &offset, &bytes); if (ret == -EAGAIN) { re_search = true; goto again; } out_lock: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); out: return ret; } void btrfs_dump_free_space(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; struct rb_node *n; int count = 0; /* * Zoned btrfs does not use free space tree and cluster. Just print * out the free space after the allocation offset. */ if (btrfs_is_zoned(fs_info)) { btrfs_info(fs_info, "free space %llu active %d", block_group->zone_capacity - block_group->alloc_offset, test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)); return; } spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", info->offset, info->bytes, str_yes_no(info->bitmap)); } spin_unlock(&ctl->tree_lock); btrfs_info(fs_info, "block group has cluster?: %s", str_no_yes(list_empty(&block_group->cluster_list))); btrfs_info(fs_info, "%d free space entries at or bigger than %llu bytes", count, bytes); } void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl) { struct btrfs_fs_info *fs_info = block_group->fs_info; spin_lock_init(&ctl->tree_lock); ctl->unit = fs_info->sectorsize; ctl->start = block_group->start; ctl->block_group = block_group; ctl->op = &free_space_op; ctl->free_space_bytes = RB_ROOT_CACHED; INIT_LIST_HEAD(&ctl->trimming_ranges); mutex_init(&ctl->cache_writeout_mutex); /* * we only want to have 32k of ram per block group for keeping * track of free space, and if we pass 1/2 of that we want to * start converting things over to using bitmaps */ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); } /* * for a given cluster, put all of its extents back into the free * space cache. If the block group passed doesn't match the block group * pointed to by the cluster, someone else raced in and freed the * cluster already. In that case, we just return without changing anything */ static void __btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct rb_node *node; lockdep_assert_held(&ctl->tree_lock); spin_lock(&cluster->lock); if (cluster->block_group != block_group) { spin_unlock(&cluster->lock); return; } cluster->block_group = NULL; cluster->window_start = 0; list_del_init(&cluster->block_group_list); node = rb_first(&cluster->root); while (node) { struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); RB_CLEAR_NODE(&entry->offset_index); if (!entry->bitmap) { /* Merging treats extents as if they were new */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; } try_merge_free_space(ctl, entry, false); steal_from_bitmap(ctl, entry, false); /* As we insert directly, update these statistics */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]++; ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; } } tree_insert_offset(ctl, NULL, entry); rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, entry_less); } cluster->root = RB_ROOT; spin_unlock(&cluster->lock); btrfs_put_block_group(block_group); } void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_cluster *cluster; struct list_head *head; spin_lock(&ctl->tree_lock); while ((head = block_group->cluster_list.next) != &block_group->cluster_list) { cluster = list_entry(head, struct btrfs_free_cluster, block_group_list); WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache(ctl); btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); } /* * Walk @block_group's free space rb_tree to determine if everything is trimmed. */ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; struct rb_node *node; bool ret = true; spin_lock(&ctl->tree_lock); node = rb_first(&ctl->free_space_offset); while (node) { info = rb_entry(node, struct btrfs_free_space, offset_index); if (!btrfs_free_space_trimmed(info)) { ret = false; break; } node = rb_next(node); } spin_unlock(&ctl->tree_lock); return ret; } u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; u64 align_gap = 0; u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; bool use_bytes_index = (offset == block_group->start); ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, block_group->full_stripe_len, max_extent_size, use_bytes_index); if (!entry) goto out; ret = offset; if (entry->bitmap) { bitmap_clear_bits(ctl, entry, offset, bytes, true); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); if (!entry->bytes) free_bitmap(ctl, entry); } else { unlink_free_space(ctl, entry, true); align_gap_len = offset - entry->offset; align_gap = entry->offset; align_gap_trim_state = entry->trim_state; if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); entry->offset = offset + bytes; WARN_ON(entry->bytes < bytes + align_gap_len); entry->bytes -= bytes + align_gap_len; if (!entry->bytes) kmem_cache_free(btrfs_free_space_cachep, entry); else link_free_space(ctl, entry); } out: btrfs_discard_update_discardable(block_group); spin_unlock(&ctl->tree_lock); if (align_gap_len) __btrfs_add_free_space(block_group, align_gap, align_gap_len, align_gap_trim_state); return ret; } /* * given a cluster, put all of its extents back into the free space * cache. If a block group is passed, this function will only free * a cluster that belongs to the passed block group. * * Otherwise, it'll get a reference on the block group pointed to by the * cluster and remove the cluster from it. */ void btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl; /* first, get a safe pointer to the block group */ spin_lock(&cluster->lock); if (!block_group) { block_group = cluster->block_group; if (!block_group) { spin_unlock(&cluster->lock); return; } } else if (cluster->block_group != block_group) { /* someone else has already freed it don't redo their work */ spin_unlock(&cluster->lock); return; } btrfs_get_block_group(block_group); spin_unlock(&cluster->lock); ctl = block_group->free_space_ctl; /* now return any extents the cluster had on it */ spin_lock(&ctl->tree_lock); __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); /* finally drop our ref */ btrfs_put_block_group(block_group); } static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct btrfs_free_space *entry, u64 bytes, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int err; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; search_start = min_start; search_bytes = bytes; err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); return 0; } ret = search_start; bitmap_clear_bits(ctl, entry, ret, bytes, false); return ret; } /* * given a cluster, try to allocate 'bytes' from it, returns 0 * if it couldn't find anything suitably large, or a logical disk offset * if things worked out */ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 bytes, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; struct rb_node *node; u64 ret = 0; ASSERT(!btrfs_is_zoned(block_group->fs_info)); spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; if (cluster->block_group != block_group) goto out; node = rb_first(&cluster->root); if (!node) goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); while (1) { if (entry->bytes < bytes) *max_extent_size = max(get_max_extent_size(entry), *max_extent_size); if (entry->bytes < bytes || (!entry->bitmap && entry->offset < min_start)) { node = rb_next(&entry->offset_index); if (!node) break; entry = rb_entry(node, struct btrfs_free_space, offset_index); continue; } if (entry->bitmap) { ret = btrfs_alloc_from_bitmap(block_group, cluster, entry, bytes, cluster->window_start, max_extent_size); if (ret == 0) { node = rb_next(&entry->offset_index); if (!node) break; entry = rb_entry(node, struct btrfs_free_space, offset_index); continue; } cluster->window_start += bytes; } else { ret = entry->offset; entry->offset += bytes; entry->bytes -= bytes; } break; } out: spin_unlock(&cluster->lock); if (!ret) return 0; spin_lock(&ctl->tree_lock); if (!btrfs_free_space_trimmed(entry)) atomic64_add(bytes, &discard_ctl->discard_bytes_saved); ctl->free_space -= bytes; if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; spin_lock(&cluster->lock); if (entry->bytes == 0) { rb_erase(&entry->offset_index, &cluster->root); ctl->free_extents--; if (entry->bitmap) { kmem_cache_free(btrfs_free_space_bitmap_cachep, entry->bitmap); ctl->total_bitmaps--; recalculate_thresholds(ctl); } else if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; } kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; } static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, struct btrfs_free_space *entry, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; unsigned long next_zero; unsigned long i; unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; lockdep_assert_held(&ctl->tree_lock); i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); /* * Don't bother looking for a cluster in this bitmap if it's heavily * fragmented. */ if (entry->max_extent_size && entry->max_extent_size < cont1_bytes) return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { next_zero = find_next_zero_bit(entry->bitmap, BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; if (found_bits > max_bits) max_bits = found_bits; break; } if (next_zero - i > max_bits) max_bits = next_zero - i; i = next_zero; } if (!found_bits) { entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; } if (!total_found) { start = i; cluster->max_size = 0; } total_found += found_bits; if (cluster->max_size < found_bits * ctl->unit) cluster->max_size = found_bits * ctl->unit; if (total_found < want_bits || cluster->max_size < cont1_bytes) { i = next_zero + 1; goto again; } cluster->window_start = start * ctl->unit + entry->offset; rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); /* * We need to know if we're currently on the normal space index when we * manipulate the bitmap so that we know we need to remove and re-insert * it into the space_index tree. Clear the bytes_index node here so the * bitmap manipulation helpers know not to mess with the space_index * until this bitmap entry is added back into the normal cache. */ RB_CLEAR_NODE(&entry->bytes_index); ret = tree_insert_offset(ctl, cluster, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * ctl->unit, 1); return 0; } /* * This searches the block group for just extents to fill the cluster with. * Try to find a cluster with at least bytes total bytes, at least one * extent of cont1_bytes, and other clusters of at least min_bytes. */ static noinline int setup_cluster_no_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; struct btrfs_free_space *last; struct rb_node *node; u64 window_free; u64 max_extent; u64 total_size = 0; lockdep_assert_held(&ctl->tree_lock); entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) return -ENOSPC; /* * We don't want bitmaps, so just move along until we find a normal * extent entry. */ while (entry->bitmap || entry->bytes < min_bytes) { if (entry->bitmap && list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); } window_free = entry->bytes; max_extent = entry->bytes; first = entry; last = entry; for (node = rb_next(&entry->offset_index); node; node = rb_next(&entry->offset_index)) { entry = rb_entry(node, struct btrfs_free_space, offset_index); if (entry->bitmap) { if (list_empty(&entry->list)) list_add_tail(&entry->list, bitmaps); continue; } if (entry->bytes < min_bytes) continue; last = entry; window_free += entry->bytes; if (entry->bytes > max_extent) max_extent = entry->bytes; } if (window_free < bytes || max_extent < cont1_bytes) return -ENOSPC; cluster->window_start = first->offset; node = &first->offset_index; /* * now we've found our entries, pull them out of the free space * cache and put them into the cluster rbtree */ do { int ret; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); if (entry->bitmap || entry->bytes < min_bytes) continue; rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); ret = tree_insert_offset(ctl, cluster, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); return 0; } /* * This specifically looks for bitmaps that may work in the cluster, we assume * that we have already failed to find extents that will work. */ static noinline int setup_cluster_bitmap(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, struct list_head *bitmaps, u64 offset, u64 bytes, u64 cont1_bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry = NULL; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* * The bitmap that covers offset won't be in the list unless offset * is just its start offset. */ if (!list_empty(bitmaps)) entry = list_first_entry(bitmaps, struct btrfs_free_space, list); if (!entry || entry->offset != bitmap_offset) { entry = tree_search_offset(ctl, bitmap_offset, 1, 0); if (entry && list_empty(&entry->list)) list_add(&entry->list, bitmaps); } list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, bytes, cont1_bytes, min_bytes); if (!ret) return 0; } /* * The bitmaps list has all the bitmaps that record free space * starting after offset, so no more search is required. */ return -ENOSPC; } /* * here we try to find a cluster of blocks in a block group. The goal * is to find at least bytes+empty_size. * We might not find them all in one contiguous area. * * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry, *tmp; LIST_HEAD(bitmaps); u64 min_bytes; u64 cont1_bytes; int ret; /* * Choose the minimum extent size we'll require for this * cluster. For SSD_SPREAD, don't allow any fragmentation. * For metadata, allow allocates with smaller extents. For * data, keep it dense. */ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { cont1_bytes = bytes + empty_size; min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; } else { cont1_bytes = max(bytes, (bytes + empty_size) >> 2); min_bytes = fs_info->sectorsize; } spin_lock(&ctl->tree_lock); /* * If we know we don't have enough space to make a cluster don't even * bother doing all the work to try and find one. */ if (ctl->free_space < bytes) { spin_unlock(&ctl->tree_lock); return -ENOSPC; } spin_lock(&cluster->lock); /* someone already found a cluster, hooray */ if (cluster->block_group) { ret = 0; goto out; } trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, min_bytes); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); if (ret) ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); /* Clear our temporary list */ list_for_each_entry_safe(entry, tmp, &bitmaps, list) list_del_init(&entry->list); if (!ret) { btrfs_get_block_group(block_group); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; } else { trace_btrfs_failed_cluster_setup(block_group); } out: spin_unlock(&cluster->lock); spin_unlock(&ctl->tree_lock); return ret; } /* * simple code to zero out a cluster */ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) { spin_lock_init(&cluster->lock); spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, enum btrfs_trim_state reserved_trim_state, struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; const u64 end = start + bytes; const u64 reserved_end = reserved_start + reserved_bytes; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; u64 trimmed = 0; spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (!block_group->ro) { block_group->reserved += reserved_bytes; space_info->bytes_reserved += reserved_bytes; update = 1; } spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); if (!ret) { *total_trimmed += trimmed; trim_state = BTRFS_TRIM_STATE_TRIMMED; } mutex_lock(&ctl->cache_writeout_mutex); if (reserved_start < start) __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); if (end < reserved_end) __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); __btrfs_add_free_space(block_group, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); if (update) { spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (block_group->ro) space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); } return ret; } /* * If @async is set, then we will trim 1 region and return. */ static int trim_no_bitmap(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen, bool async) { struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; struct rb_node *node; int ret = 0; u64 extent_start; u64 extent_bytes; enum btrfs_trim_state extent_trim_state; u64 bytes; const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (start < end) { struct btrfs_trim_range trim_entry; mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) goto out_unlock; entry = tree_search_offset(ctl, start, 0, 1); if (!entry) goto out_unlock; /* Skip bitmaps and if async, already trimmed entries */ while (entry->bitmap || (async && btrfs_free_space_trimmed(entry))) { node = rb_next(&entry->offset_index); if (!node) goto out_unlock; entry = rb_entry(node, struct btrfs_free_space, offset_index); } if (entry->offset >= end) goto out_unlock; extent_start = entry->offset; extent_bytes = entry->bytes; extent_trim_state = entry->trim_state; if (async) { start = entry->offset; bytes = entry->bytes; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } unlink_free_space(ctl, entry, true); /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim * X when we come back around. So trim it now. */ if (max_discard_size && bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) { bytes = max_discard_size; extent_bytes = max_discard_size; entry->offset += max_discard_size; entry->bytes -= max_discard_size; link_free_space(ctl, entry); } else { kmem_cache_free(btrfs_free_space_cachep, entry); } } else { start = max(start, extent_start); bytes = min(extent_start + extent_bytes, end) - start; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } unlink_free_space(ctl, entry, true); kmem_cache_free(btrfs_free_space_cachep, entry); } spin_unlock(&ctl->tree_lock); trim_entry.start = extent_start; trim_entry.bytes = extent_bytes; list_add_tail(&trim_entry.list, &ctl->trimming_ranges); mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, extent_start, extent_bytes, extent_trim_state, &trim_entry); if (ret) { block_group->discard_cursor = start + bytes; break; } next: start += bytes; block_group->discard_cursor = start; if (async && *total_trimmed) break; if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } cond_resched(); } return ret; out_unlock: block_group->discard_cursor = btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); return ret; } /* * If we break out of trimming a bitmap prematurely, we should reset the * trimming bit. In a rather contrieved case, it's possible to race here so * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. * * start = start of bitmap * end = near end of bitmap * * Thread 1: Thread 2: * trim_bitmaps(start) * trim_bitmaps(end) * end_trimming_bitmap() * reset_trimming_bitmap() */ static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { struct btrfs_free_space *entry; spin_lock(&ctl->tree_lock); entry = tree_search_offset(ctl, offset, 1, 0); if (entry) { if (btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR] += entry->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; } entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; } spin_unlock(&ctl->tree_lock); } static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *entry) { if (btrfs_free_space_trimming_bitmap(entry)) { entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; ctl->discardable_extents[BTRFS_STAT_CURR] -= entry->bitmap_extents; ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; } } /* * If @async is set, then we will trim 1 region and return. */ static int trim_bitmaps(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async) { struct btrfs_discard_ctl *discard_ctl = &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; int ret = 0; int ret2; u64 bytes; u64 offset = offset_to_bitmap(ctl, start); const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (offset < end) { bool next_bitmap = false; struct btrfs_trim_range trim_entry; mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { block_group->discard_cursor = btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); /* * Bitmaps are marked trimmed lossily now to prevent constant * discarding of the same bitmap (the reason why we are bound * by the filters). So, retrim the block group bitmaps when we * are preparing to punt to the unused_bgs list. This uses * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED * which is the only discard index which sets minlen to 0. */ if (!entry || (async && minlen && start == offset && btrfs_free_space_trimmed(entry))) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } /* * Async discard bitmap trimming begins at by setting the start * to be key.objectid and the offset_to_bitmap() aligns to the * start of the bitmap. This lets us know we are fully * scanning the bitmap rather than only some portion of it. */ if (start == offset) entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; bytes = minlen; ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { /* * We lossily consider a bitmap trimmed if we only skip * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. */ if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) end_trimming_bitmap(ctl, entry); else entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } /* * We already trimmed a region, but are using the locking above * to reset the trim_state. */ if (async && *total_trimmed) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto out; } bytes = min(bytes, end - start); if (bytes < minlen || (async && maxlen && bytes > maxlen)) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } /* * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. * If X < @minlen, we won't trim X when we come back around. * So trim it now. We differ here from trimming extents as we * don't keep individual state per bit. */ if (async && max_discard_size && bytes > (max_discard_size + minlen)) bytes = max_discard_size; bitmap_clear_bits(ctl, entry, start, bytes, true); if (entry->bytes == 0) free_bitmap(ctl, entry); spin_unlock(&ctl->tree_lock); trim_entry.start = start; trim_entry.bytes = bytes; list_add_tail(&trim_entry.list, &ctl->trimming_ranges); mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, start, bytes, 0, &trim_entry); if (ret) { reset_trimming_bitmap(ctl, offset); block_group->discard_cursor = btrfs_block_group_end(block_group); break; } next: if (next_bitmap) { offset += BITS_PER_BITMAP * ctl->unit; start = offset; } else { start += bytes; } block_group->discard_cursor = start; if (btrfs_trim_interrupted()) { if (start != offset) reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; break; } cond_resched(); } if (offset >= end) block_group->discard_cursor = end; out: return ret; } int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; u64 rem = 0; ASSERT(!btrfs_is_zoned(block_group->fs_info)); *trimmed = 0; spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); if (ret) goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); /* If we ended in the middle of a bitmap, reset the trimming flag */ if (rem) reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); out: btrfs_unfreeze_block_group(block_group); return ret; } int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, bool async) { int ret; *trimmed = 0; spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); btrfs_unfreeze_block_group(block_group); return ret; } int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen, u64 maxlen, bool async) { int ret; *trimmed = 0; spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); return 0; } btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, async); btrfs_unfreeze_block_group(block_group); return ret; } bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) { return btrfs_super_cache_generation(fs_info->super_copy); } static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans) { struct btrfs_block_group *block_group; struct rb_node *node; int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); if (ret) goto out; node = rb_next(node); } out: return ret; } int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) { struct btrfs_trans_handle *trans; int ret; /* * update_super_roots will appropriately set or unset * super_copy->cache_generation based on SPACE_CACHE and * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a * transaction commit whether we are enabling space cache v1 and don't * have any other work to do, or are disabling it and removing free * space inodes. */ trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); if (!active) { set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); ret = cleanup_free_space_cache_v1(fs_info, trans); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); out: clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); return ret; } int __init btrfs_free_space_init(void) { btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); if (!btrfs_free_space_cachep) return -ENOMEM; btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", PAGE_SIZE, PAGE_SIZE, 0, NULL); if (!btrfs_free_space_bitmap_cachep) { kmem_cache_destroy(btrfs_free_space_cachep); return -ENOMEM; } return 0; } void __cold btrfs_free_space_exit(void) { kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it * doesn't do any of the merging that add_free_space does, this acts a lot like * how the free space cache loading stuff works, so you can get really weird * configurations. */ int test_add_free_space_entry(struct btrfs_block_group *cache, u64 offset, u64 bytes, bool bitmap) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info = NULL, *bitmap_info; void *map = NULL; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; u64 bytes_added; int ret; again: if (!info) { info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; } if (!bitmap) { spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); return ret; } if (!map) { map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!map) { kmem_cache_free(btrfs_free_space_cachep, info); return -ENOMEM; } } spin_lock(&ctl->tree_lock); bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { info->bitmap = map; map = NULL; add_new_bitmap(ctl, info, offset); bitmap_info = info; info = NULL; } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, trim_state); bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); if (bytes) goto again; if (info) kmem_cache_free(btrfs_free_space_cachep, info); if (map) kmem_cache_free(btrfs_free_space_bitmap_cachep, map); return 0; } /* * Checks to see if the given range is in the free space cache. This is really * just used to check the absence of space, so if there is free space in the * range at all we will return 1. */ int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info; int ret = 0; spin_lock(&ctl->tree_lock); info = tree_search_offset(ctl, offset, 0, 0); if (!info) { info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) goto out; } have_info: if (info->bitmap) { u64 bit_off, bit_bytes; struct rb_node *n; struct btrfs_free_space *tmp; bit_off = offset; bit_bytes = ctl->unit; ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; goto out; } else if (bit_off > offset && offset + bytes > bit_off) { ret = 1; goto out; } } n = rb_prev(&info->offset_index); while (n) { tmp = rb_entry(n, struct btrfs_free_space, offset_index); if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { n = rb_prev(&tmp->offset_index); continue; } info = tmp; goto have_info; } n = rb_next(&info->offset_index); while (n) { tmp = rb_entry(n, struct btrfs_free_space, offset_index); if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { n = rb_next(&tmp->offset_index); continue; } info = tmp; goto have_info; } ret = 0; goto out; } if (info->offset == offset) { ret = 1; goto out; } if (offset > info->offset && offset < info->offset + info->bytes) ret = 1; out: spin_unlock(&ctl->tree_lock); return ret; } #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ |
| 178 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | /* * net/tipc/bearer.h: Include file for TIPC bearer code * * Copyright (c) 1996-2006, 2013-2016, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_BEARER_H #define _TIPC_BEARER_H #include "netlink.h" #include "core.h" #include "msg.h" #include <net/genetlink.h> #define MAX_MEDIA 3 /* Identifiers associated with TIPC message header media address info * - address info field is 32 bytes long * - the field's actual content and length is defined per media * - remaining unused bytes in the field are set to zero */ #define TIPC_MEDIA_INFO_SIZE 32 #define TIPC_MEDIA_TYPE_OFFSET 3 #define TIPC_MEDIA_ADDR_OFFSET 4 /* * Identifiers of supported TIPC media types */ #define TIPC_MEDIA_TYPE_ETH 1 #define TIPC_MEDIA_TYPE_IB 2 #define TIPC_MEDIA_TYPE_UDP 3 /* Minimum bearer MTU */ #define TIPC_MIN_BEARER_MTU (MAX_H_SIZE + INT_H_SIZE) /* Identifiers for distinguishing between broadcast/multicast and replicast */ #define TIPC_BROADCAST_SUPPORT 1 #define TIPC_REPLICAST_SUPPORT 2 /** * struct tipc_media_addr - destination address used by TIPC bearers * @value: address info (format defined by media) * @media_id: TIPC media type identifier * @broadcast: non-zero if address is a broadcast address */ struct tipc_media_addr { u8 value[TIPC_MEDIA_INFO_SIZE]; u8 media_id; u8 broadcast; }; struct tipc_bearer; /** * struct tipc_media - Media specific info exposed to generic bearer layer * @send_msg: routine which handles buffer transmission * @enable_media: routine which enables a media * @disable_media: routine which disables a media * @addr2str: convert media address format to string * @addr2msg: convert from media addr format to discovery msg addr format * @msg2addr: convert from discovery msg addr format to media addr format * @raw2addr: convert from raw addr format to media addr format * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure * @min_win: minimum window (in packets) before declaring link congestion * @max_win: maximum window (in packets) before declaring link congestion * @mtu: max packet size bearer can support for media type not dependent on * underlying device MTU * @type_id: TIPC media identifier * @hwaddr_len: TIPC media address len * @name: media name */ struct tipc_media { int (*send_msg)(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); int (*enable_media)(struct net *net, struct tipc_bearer *b, struct nlattr *attr[]); void (*disable_media)(struct tipc_bearer *b); int (*addr2str)(struct tipc_media_addr *addr, char *strbuf, int bufsz); int (*addr2msg)(char *msg, struct tipc_media_addr *addr); int (*msg2addr)(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg); int (*raw2addr)(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *raw); u32 priority; u32 tolerance; u32 min_win; u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; char name[TIPC_MAX_MEDIA_NAME]; }; /** * struct tipc_bearer - Generic TIPC bearer structure * @media_ptr: pointer to additional media-specific information about bearer * @mtu: max packet size bearer can support * @addr: media-specific address associated with bearer * @name: bearer name (format = media:interface) * @media: ptr to media structure associated with bearer * @bcast_addr: media address used in broadcasting * @pt: packet type for bearer * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer * @min_win: minimum window (in packets) before declaring link congestion * @max_win: maximum window (in packets) before declaring link congestion * @tolerance: default link tolerance for bearer * @domain: network domain to which links can be established * @identity: array index of this bearer within TIPC bearer array * @disc: ptr to link setup request * @net_plane: network plane ('A' through 'H') currently associated with bearer * @encap_hlen: encap headers length * @up: bearer up flag (bit 0) * @refcnt: tipc_bearer reference counter * * Note: media-specific code is responsible for initialization of the fields * indicated below when a bearer is enabled; TIPC's generic bearer code takes * care of initializing all other fields. */ struct tipc_bearer { void __rcu *media_ptr; /* initialized by media */ u32 mtu; /* initialized by media */ struct tipc_media_addr addr; /* initialized by media */ char name[TIPC_MAX_BEARER_NAME]; struct tipc_media *media; struct tipc_media_addr bcast_addr; struct packet_type pt; struct rcu_head rcu; u32 priority; u32 min_win; u32 max_win; u32 tolerance; u32 domain; u32 identity; struct tipc_discoverer *disc; char net_plane; u16 encap_hlen; unsigned long up; refcount_t refcnt; }; struct tipc_bearer_names { char media_name[TIPC_MAX_MEDIA_NAME]; char if_name[TIPC_MAX_IF_NAME]; }; /* * TIPC routines available to supported media types */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b); /* * Routines made available to TIPC by supported media types */ extern struct tipc_media eth_media_info; #ifdef CONFIG_TIPC_MEDIA_IB extern struct tipc_media ib_media_info; #endif #ifdef CONFIG_TIPC_MEDIA_UDP extern struct tipc_media udp_media_info; #endif int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attrs[]); bool tipc_bearer_hold(struct tipc_bearer *b); void tipc_bearer_put(struct tipc_bearer *b); void tipc_disable_l2_media(struct tipc_bearer *b); int tipc_l2_send_msg(struct net *net, struct sk_buff *buf, struct tipc_bearer *b, struct tipc_media_addr *dest); void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); int tipc_bearer_get_name(struct net *net, char *name, u32 bearer_id); struct tipc_media *tipc_media_find(const char *name); int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); int tipc_bearer_mtu(struct net *net, u32 bearer_id); int tipc_bearer_min_mtu(struct net *net, u32 bearer_id); bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id); void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, struct tipc_media_addr *dest); void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr *dst, struct tipc_node *__dnode); void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq); void tipc_clone_to_loopback(struct net *net, struct sk_buff_head *pkts); int tipc_attach_loopback(struct net *net); void tipc_detach_loopback(struct net *net); static inline void tipc_loopback_trace(struct net *net, struct sk_buff_head *pkts) { if (unlikely(dev_nit_active(net->loopback_dev))) tipc_clone_to_loopback(net, pkts); } /* check if device MTU is too low for tipc headers */ static inline bool tipc_mtu_bad(struct net_device *dev) { if (dev->mtu >= TIPC_MIN_BEARER_MTU) return false; netdev_warn(dev, "MTU too low for tipc bearer\n"); return true; } #endif /* _TIPC_BEARER_H */ |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. */ #ifndef LINUX_DMAENGINE_H #define LINUX_DMAENGINE_H #include <linux/device.h> #include <linux/err.h> #include <linux/uio.h> #include <linux/bug.h> #include <linux/scatterlist.h> #include <linux/bitmap.h> #include <linux/types.h> #include <asm/page.h> /** * typedef dma_cookie_t - an opaque DMA cookie * * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code */ typedef s32 dma_cookie_t; #define DMA_MIN_COOKIE 1 static inline int dma_submit_error(dma_cookie_t cookie) { return cookie < 0 ? cookie : 0; } /** * enum dma_status - DMA transaction status * @DMA_COMPLETE: transaction completed * @DMA_IN_PROGRESS: transaction not yet processed * @DMA_PAUSED: transaction is paused * @DMA_ERROR: transaction failed */ enum dma_status { DMA_COMPLETE, DMA_IN_PROGRESS, DMA_PAUSED, DMA_ERROR, DMA_OUT_OF_ORDER, }; /** * enum dma_transaction_type - DMA transaction types/indexes * * Note: The DMA_ASYNC_TX capability is not to be set by drivers. It is * automatically set as dma devices are registered. */ enum dma_transaction_type { DMA_MEMCPY, DMA_XOR, DMA_PQ, DMA_XOR_VAL, DMA_PQ_VAL, DMA_MEMSET, DMA_MEMSET_SG, DMA_INTERRUPT, DMA_PRIVATE, DMA_ASYNC_TX, DMA_SLAVE, DMA_CYCLIC, DMA_INTERLEAVE, DMA_COMPLETION_NO_ORDER, DMA_REPEAT, DMA_LOAD_EOT, /* last transaction type for creation of the capabilities mask */ DMA_TX_TYPE_END, }; /** * enum dma_transfer_direction - dma transfer mode and direction indicator * @DMA_MEM_TO_MEM: Async/Memcpy mode * @DMA_MEM_TO_DEV: Slave mode & From Memory to Device * @DMA_DEV_TO_MEM: Slave mode & From Device to Memory * @DMA_DEV_TO_DEV: Slave mode & From Device to Device */ enum dma_transfer_direction { DMA_MEM_TO_MEM, DMA_MEM_TO_DEV, DMA_DEV_TO_MEM, DMA_DEV_TO_DEV, DMA_TRANS_NONE, }; /* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. * The gap(in bytes) between two chunks is called inter-chunk-gap(ICG). * ICGs may or may not change between chunks. * A FRAME is the smallest series of contiguous {chunk,icg} pairs, * that when repeated an integral number of times, specifies the transfer. * A transfer template is specification of a Frame, the number of times * it is to be repeated and other per-transfer attributes. * * Practically, a client driver would have ready a template for each * type of transfer it is going to need during its lifetime and * set only 'src_start' and 'dst_start' before submitting the requests. * * * | Frame-1 | Frame-2 | ~ | Frame-'numf' | * |====....==.===...=...|====....==.===...=...| ~ |====....==.===...=...| * * == Chunk size * ... ICG */ /** * struct data_chunk - Element of scatter-gather list that makes a frame. * @size: Number of bytes to read from source. * size_dst := fn(op, size_src), so doesn't mean much for destination. * @icg: Number of bytes to jump after last src/dst address of this * chunk and before first src/dst address for next chunk. * Ignored for dst(assumed 0), if dst_inc is true and dst_sgl is false. * Ignored for src(assumed 0), if src_inc is true and src_sgl is false. * @dst_icg: Number of bytes to jump after last dst address of this * chunk and before the first dst address for next chunk. * Ignored if dst_inc is true and dst_sgl is false. * @src_icg: Number of bytes to jump after last src address of this * chunk and before the first src address for next chunk. * Ignored if src_inc is true and src_sgl is false. */ struct data_chunk { size_t size; size_t icg; size_t dst_icg; size_t src_icg; }; /** * struct dma_interleaved_template - Template to convey DMAC the transfer pattern * and attributes. * @src_start: Bus address of source for the first chunk. * @dst_start: Bus address of destination for the first chunk. * @dir: Specifies the type of Source and Destination. * @src_inc: If the source address increments after reading from it. * @dst_inc: If the destination address increments after writing to it. * @src_sgl: If the 'icg' of sgl[] applies to Source (scattered read). * Otherwise, source is read contiguously (icg ignored). * Ignored if src_inc is false. * @dst_sgl: If the 'icg' of sgl[] applies to Destination (scattered write). * Otherwise, destination is filled contiguously (icg ignored). * Ignored if dst_inc is false. * @numf: Number of frames in this template. * @frame_size: Number of chunks in a frame i.e, size of sgl[]. * @sgl: Array of {chunk,icg} pairs that make up a frame. */ struct dma_interleaved_template { dma_addr_t src_start; dma_addr_t dst_start; enum dma_transfer_direction dir; bool src_inc; bool dst_inc; bool src_sgl; bool dst_sgl; size_t numf; size_t frame_size; struct data_chunk sgl[]; }; /** * struct dma_vec - DMA vector * @addr: Bus address of the start of the vector * @len: Length in bytes of the DMA vector */ struct dma_vec { dma_addr_t addr; size_t len; }; /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, * control completion, and communicate status. * @DMA_PREP_INTERRUPT - trigger an interrupt (callback) upon completion of * this transaction * @DMA_CTRL_ACK - if clear, the descriptor cannot be reused until the client * acknowledges receipt, i.e. has a chance to establish any dependency * chains * @DMA_PREP_PQ_DISABLE_P - prevent generation of P while generating Q * @DMA_PREP_PQ_DISABLE_Q - prevent generation of Q while generating P * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as * sources that were the result of a previous operation, in the case of a PQ * operation it continues the calculation with new sources * @DMA_PREP_FENCE - tell the driver that subsequent operations depend * on the result of this operation * @DMA_CTRL_REUSE: client can reuse the descriptor and submit again till * cleared or freed * @DMA_PREP_CMD: tell the driver that the data passed to DMA API is command * data and the descriptor should be in different format from normal * data descriptors. * @DMA_PREP_REPEAT: tell the driver that the transaction shall be automatically * repeated when it ends until a transaction is issued on the same channel * with the DMA_PREP_LOAD_EOT flag set. This flag is only applicable to * interleaved transactions and is ignored for all other transaction types. * @DMA_PREP_LOAD_EOT: tell the driver that the transaction shall replace any * active repeated (as indicated by DMA_PREP_REPEAT) transaction when the * repeated transaction ends. Not setting this flag when the previously queued * transaction is marked with DMA_PREP_REPEAT will cause the new transaction * to never be processed and stay in the issued queue forever. The flag is * ignored if the previous transaction is not a repeated transaction. */ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), DMA_CTRL_ACK = (1 << 1), DMA_PREP_PQ_DISABLE_P = (1 << 2), DMA_PREP_PQ_DISABLE_Q = (1 << 3), DMA_PREP_CONTINUE = (1 << 4), DMA_PREP_FENCE = (1 << 5), DMA_CTRL_REUSE = (1 << 6), DMA_PREP_CMD = (1 << 7), DMA_PREP_REPEAT = (1 << 8), DMA_PREP_LOAD_EOT = (1 << 9), }; /** * enum sum_check_bits - bit position of pq_check_flags */ enum sum_check_bits { SUM_CHECK_P = 0, SUM_CHECK_Q = 1, }; /** * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ enum sum_check_flags { SUM_CHECK_P_RESULT = (1 << SUM_CHECK_P), SUM_CHECK_Q_RESULT = (1 << SUM_CHECK_Q), }; /** * dma_cap_mask_t - capabilities bitmap modeled after cpumask_t. * See linux/cpumask.h */ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; /** * enum dma_desc_metadata_mode - per descriptor metadata mode types supported * @DESC_METADATA_CLIENT - the metadata buffer is allocated/provided by the * client driver and it is attached (via the dmaengine_desc_attach_metadata() * helper) to the descriptor. * * Client drivers interested to use this mode can follow: * - DMA_MEM_TO_DEV / DEV_MEM_TO_MEM: * 1. prepare the descriptor (dmaengine_prep_*) * construct the metadata in the client's buffer * 2. use dmaengine_desc_attach_metadata() to attach the buffer to the * descriptor * 3. submit the transfer * - DMA_DEV_TO_MEM: * 1. prepare the descriptor (dmaengine_prep_*) * 2. use dmaengine_desc_attach_metadata() to attach the buffer to the * descriptor * 3. submit the transfer * 4. when the transfer is completed, the metadata should be available in the * attached buffer * * @DESC_METADATA_ENGINE - the metadata buffer is allocated/managed by the DMA * driver. The client driver can ask for the pointer, maximum size and the * currently used size of the metadata and can directly update or read it. * dmaengine_desc_get_metadata_ptr() and dmaengine_desc_set_metadata_len() is * provided as helper functions. * * Note: the metadata area for the descriptor is no longer valid after the * transfer has been completed (valid up to the point when the completion * callback returns if used). * * Client drivers interested to use this mode can follow: * - DMA_MEM_TO_DEV / DEV_MEM_TO_MEM: * 1. prepare the descriptor (dmaengine_prep_*) * 2. use dmaengine_desc_get_metadata_ptr() to get the pointer to the engine's * metadata area * 3. update the metadata at the pointer * 4. use dmaengine_desc_set_metadata_len() to tell the DMA engine the amount * of data the client has placed into the metadata buffer * 5. submit the transfer * - DMA_DEV_TO_MEM: * 1. prepare the descriptor (dmaengine_prep_*) * 2. submit the transfer * 3. on transfer completion, use dmaengine_desc_get_metadata_ptr() to get the * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. */ enum dma_desc_metadata_mode { DESC_METADATA_NONE = 0, DESC_METADATA_CLIENT = BIT(0), DESC_METADATA_ENGINE = BIT(1), }; /** * struct dma_chan_percpu - the per-CPU part of struct dma_chan * @memcpy_count: transaction counter * @bytes_transferred: byte counter */ struct dma_chan_percpu { /* stats */ unsigned long memcpy_count; unsigned long bytes_transferred; }; /** * struct dma_router - DMA router structure * @dev: pointer to the DMA router device * @route_free: function to be called when the route can be disconnected */ struct dma_router { struct device *dev; void (*route_free)(struct device *dev, void *route_data); }; /** * struct dma_chan - devices supply DMA channels, clients use them * @device: ptr to the dma device who supplies this channel, always !%NULL * @slave: ptr to the device using this channel * @cookie: last cookie value returned to client * @completed_cookie: last completed cookie for this channel * @chan_id: channel ID for sysfs * @dev: class device for sysfs * @name: backlink name for sysfs * @dbg_client_name: slave name for debugfs in format: * dev_name(requester's dev):channel name, for example: "2b00000.mcasp:tx" * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client_count: how many clients are using this channel * @table_count: number of appearances in the mem-to-mem allocation table * @router: pointer to the DMA router structure * @route_data: channel specific data for the router * @private: private data for certain client-channel associations */ struct dma_chan { struct dma_device *device; struct device *slave; dma_cookie_t cookie; dma_cookie_t completed_cookie; /* sysfs */ int chan_id; struct dma_chan_dev *dev; const char *name; #ifdef CONFIG_DEBUG_FS char *dbg_client_name; #endif struct list_head device_node; struct dma_chan_percpu __percpu *local; int client_count; int table_count; /* DMA router */ struct dma_router *router; void *route_data; void *private; }; /** * struct dma_chan_dev - relate sysfs device node to backing channel device * @chan: driver channel device * @device: sysfs device * @dev_id: parent dma_device dev_id * @chan_dma_dev: The channel is using custom/different dma-mapping * compared to the parent dma_device */ struct dma_chan_dev { struct dma_chan *chan; struct device device; int dev_id; bool chan_dma_dev; }; /** * enum dma_slave_buswidth - defines bus width of the DMA slave * device, source or target buses */ enum dma_slave_buswidth { DMA_SLAVE_BUSWIDTH_UNDEFINED = 0, DMA_SLAVE_BUSWIDTH_1_BYTE = 1, DMA_SLAVE_BUSWIDTH_2_BYTES = 2, DMA_SLAVE_BUSWIDTH_3_BYTES = 3, DMA_SLAVE_BUSWIDTH_4_BYTES = 4, DMA_SLAVE_BUSWIDTH_8_BYTES = 8, DMA_SLAVE_BUSWIDTH_16_BYTES = 16, DMA_SLAVE_BUSWIDTH_32_BYTES = 32, DMA_SLAVE_BUSWIDTH_64_BYTES = 64, DMA_SLAVE_BUSWIDTH_128_BYTES = 128, }; /** * struct dma_slave_config - dma slave channel runtime config * @direction: whether the data shall go in or out on this slave * channel, right now. DMA_MEM_TO_DEV and DMA_DEV_TO_MEM are * legal values. DEPRECATED, drivers should use the direction argument * to the device_prep_slave_sg and device_prep_dma_cyclic functions or * the dir field in the dma_interleaved_template structure. * @src_addr: this is the physical address where DMA slave data * should be read (RX), if the source is memory this argument is * ignored. * @dst_addr: this is the physical address where DMA slave data * should be written (TX), if the destination is memory this argument * is ignored. * @src_addr_width: this is the width in bytes of the source (RX) * register where DMA data shall be read. If the source * is memory this may be ignored depending on architecture. * Legal values: 1, 2, 3, 4, 8, 16, 32, 64, 128. * @dst_addr_width: same as src_addr_width but for destination * target (TX) mutatis mutandis. * @src_maxburst: the maximum number of words (note: words, as in * units of the src_addr_width member, not bytes) that can be sent * in one burst to the device. Typically something like half the * FIFO depth on I/O peripherals so you don't overflow it. This * may or may not be applicable on memory sources. * @dst_maxburst: same as src_maxburst but for destination target * mutatis mutandis. * @src_port_window_size: The length of the register area in words the data need * to be accessed on the device side. It is only used for devices which is using * an area instead of a single register to receive the data. Typically the DMA * loops in this area in order to transfer the data. * @dst_port_window_size: same as src_port_window_size but for the destination * port. * @device_fc: Flow Controller Settings. Only valid for slave channels. Fill * with 'true' if peripheral should be flow controller. Direction will be * selected at Runtime. * @peripheral_config: peripheral configuration for programming peripheral * for dmaengine transfer * @peripheral_size: peripheral configuration buffer size * * This struct is passed in as configuration data to a DMA engine * in order to set up a certain channel for DMA transport at runtime. * The DMA device/engine has to provide support for an additional * callback in the dma_device structure, device_config and this struct * will then be passed in as an argument to the function. * * The rationale for adding configuration information to this struct is as * follows: if it is likely that more than one DMA slave controllers in * the world will support the configuration option, then make it generic. * If not: if it is fixed so that it be sent in static from the platform * data, then prefer to do that. */ struct dma_slave_config { enum dma_transfer_direction direction; phys_addr_t src_addr; phys_addr_t dst_addr; enum dma_slave_buswidth src_addr_width; enum dma_slave_buswidth dst_addr_width; u32 src_maxburst; u32 dst_maxburst; u32 src_port_window_size; u32 dst_port_window_size; bool device_fc; void *peripheral_config; size_t peripheral_size; }; /** * enum dma_residue_granularity - Granularity of the reported transfer residue * @DMA_RESIDUE_GRANULARITY_DESCRIPTOR: Residue reporting is not support. The * DMA channel is only able to tell whether a descriptor has been completed or * not, which means residue reporting is not supported by this channel. The * residue field of the dma_tx_state field will always be 0. * @DMA_RESIDUE_GRANULARITY_SEGMENT: Residue is updated after each successfully * completed segment of the transfer (For cyclic transfers this is after each * period). This is typically implemented by having the hardware generate an * interrupt after each transferred segment and then the drivers updates the * outstanding residue by the size of the segment. Another possibility is if * the hardware supports scatter-gather and the segment descriptor has a field * which gets set after the segment has been completed. The driver then counts * the number of segments without the flag set to compute the residue. * @DMA_RESIDUE_GRANULARITY_BURST: Residue is updated after each transferred * burst. This is typically only supported if the hardware has a progress * register of some sort (E.g. a register with the current read/write address * or a register with the amount of bursts/beats/bytes that have been * transferred or still need to be transferred). */ enum dma_residue_granularity { DMA_RESIDUE_GRANULARITY_DESCRIPTOR = 0, DMA_RESIDUE_GRANULARITY_SEGMENT = 1, DMA_RESIDUE_GRANULARITY_BURST = 2, }; /** * struct dma_slave_caps - expose capabilities of a slave channel only * @src_addr_widths: bit mask of src addr widths the channel supports. * Width is specified in bytes, e.g. for a channel supporting * a width of 4 the mask should have BIT(4) set. * @dst_addr_widths: bit mask of dst addr widths the channel supports * @directions: bit mask of slave directions the channel supports. * Since the enum dma_transfer_direction is not defined as bit flag for * each type, the dma controller should set BIT(<TYPE>) and same * should be checked by controller as well * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer * @max_sg_burst: max number of SG list entries executed in a single burst * DMA tansaction with no software intervention for reinitialization. * Zero value means unlimited number of entries. * @cmd_pause: true, if pause is supported (i.e. for reading residue or * for resume later) * @cmd_resume: true, if resume is supported * @cmd_terminate: true, if terminate cmd is supported * @residue_granularity: granularity of the reported transfer residue * @descriptor_reuse: if a descriptor can be reused by client and * resubmitted multiple times */ struct dma_slave_caps { u32 src_addr_widths; u32 dst_addr_widths; u32 directions; u32 min_burst; u32 max_burst; u32 max_sg_burst; bool cmd_pause; bool cmd_resume; bool cmd_terminate; enum dma_residue_granularity residue_granularity; bool descriptor_reuse; }; static inline const char *dma_chan_name(struct dma_chan *chan) { return dev_name(&chan->dev->device); } /** * typedef dma_filter_fn - callback filter for dma_request_channel * @chan: channel to be reviewed * @filter_param: opaque parameter passed through dma_request_channel * * When this optional parameter is specified in a call to dma_request_channel a * suitable channel is passed to this routine for further dispositioning before * being returned. Where 'suitable' indicates a non-busy channel that * satisfies the given capability mask. It returns 'true' to indicate that the * channel is suitable. */ typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); typedef void (*dma_async_tx_callback)(void *dma_async_param); enum dmaengine_tx_result { DMA_TRANS_NOERROR = 0, /* SUCCESS */ DMA_TRANS_READ_FAILED, /* Source DMA read failed */ DMA_TRANS_WRITE_FAILED, /* Destination DMA write failed */ DMA_TRANS_ABORTED, /* Op never submitted / aborted */ }; struct dmaengine_result { enum dmaengine_tx_result result; u32 residue; }; typedef void (*dma_async_tx_callback_result)(void *dma_async_param, const struct dmaengine_result *result); struct dmaengine_unmap_data { #if IS_ENABLED(CONFIG_DMA_ENGINE_RAID) u16 map_cnt; #else u8 map_cnt; #endif u8 to_cnt; u8 from_cnt; u8 bidi_cnt; struct device *dev; struct kref kref; size_t len; dma_addr_t addr[]; }; struct dma_async_tx_descriptor; struct dma_descriptor_metadata_ops { int (*attach)(struct dma_async_tx_descriptor *desc, void *data, size_t len); void *(*get_ptr)(struct dma_async_tx_descriptor *desc, size_t *payload_len, size_t *max_len); int (*set_len)(struct dma_async_tx_descriptor *desc, size_t payload_len); }; /** * struct dma_async_tx_descriptor - async transaction descriptor * ---dma generic offload fields--- * @cookie: tracking cookie for this transaction, set to -EBUSY if * this tx is sitting on a dependency list * @flags: flags to augment operation preparation, control completion, and * communicate status * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the * @desc_free: driver's callback function to free a resusable descriptor * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE * @metadata_ops: DMA driver provided metadata mode ops, need to be set by the * DMA driver if metadata mode is supported with the descriptor * ---async_tx api specific fields--- * @next: at completion submit this descriptor * @parent: pointer to the next level up in the dependency chain * @lock: protect the parent and next pointers */ struct dma_async_tx_descriptor { dma_cookie_t cookie; enum dma_ctrl_flags flags; /* not a 'long' to pack with cookie */ dma_addr_t phys; struct dma_chan *chan; dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx); int (*desc_free)(struct dma_async_tx_descriptor *tx); dma_async_tx_callback callback; dma_async_tx_callback_result callback_result; void *callback_param; struct dmaengine_unmap_data *unmap; enum dma_desc_metadata_mode desc_metadata_mode; struct dma_descriptor_metadata_ops *metadata_ops; #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH struct dma_async_tx_descriptor *next; struct dma_async_tx_descriptor *parent; spinlock_t lock; #endif }; #ifdef CONFIG_DMA_ENGINE static inline void dma_set_unmap(struct dma_async_tx_descriptor *tx, struct dmaengine_unmap_data *unmap) { kref_get(&unmap->kref); tx->unmap = unmap; } struct dmaengine_unmap_data * dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags); void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap); #else static inline void dma_set_unmap(struct dma_async_tx_descriptor *tx, struct dmaengine_unmap_data *unmap) { } static inline struct dmaengine_unmap_data * dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) { return NULL; } static inline void dmaengine_unmap_put(struct dmaengine_unmap_data *unmap) { } #endif static inline void dma_descriptor_unmap(struct dma_async_tx_descriptor *tx) { if (!tx->unmap) return; dmaengine_unmap_put(tx->unmap); tx->unmap = NULL; } #ifndef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH static inline void txd_lock(struct dma_async_tx_descriptor *txd) { } static inline void txd_unlock(struct dma_async_tx_descriptor *txd) { } static inline void txd_chain(struct dma_async_tx_descriptor *txd, struct dma_async_tx_descriptor *next) { BUG(); } static inline void txd_clear_parent(struct dma_async_tx_descriptor *txd) { } static inline void txd_clear_next(struct dma_async_tx_descriptor *txd) { } static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descriptor *txd) { return NULL; } static inline struct dma_async_tx_descriptor *txd_parent(struct dma_async_tx_descriptor *txd) { return NULL; } #else static inline void txd_lock(struct dma_async_tx_descriptor *txd) { spin_lock_bh(&txd->lock); } static inline void txd_unlock(struct dma_async_tx_descriptor *txd) { spin_unlock_bh(&txd->lock); } static inline void txd_chain(struct dma_async_tx_descriptor *txd, struct dma_async_tx_descriptor *next) { txd->next = next; next->parent = txd; } static inline void txd_clear_parent(struct dma_async_tx_descriptor *txd) { txd->parent = NULL; } static inline void txd_clear_next(struct dma_async_tx_descriptor *txd) { txd->next = NULL; } static inline struct dma_async_tx_descriptor *txd_parent(struct dma_async_tx_descriptor *txd) { return txd->parent; } static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descriptor *txd) { return txd->next; } #endif /** * struct dma_tx_state - filled in to report the status of * a transfer. * @last: last completed DMA cookie * @used: last issued DMA cookie (i.e. the one in progress) * @residue: the remaining number of bytes left to transmit * on the selected transfer for states DMA_IN_PROGRESS and * DMA_PAUSED if this is implemented in the driver, else 0 * @in_flight_bytes: amount of data in bytes cached by the DMA. */ struct dma_tx_state { dma_cookie_t last; dma_cookie_t used; u32 residue; u32 in_flight_bytes; }; /** * enum dmaengine_alignment - defines alignment of the DMA async tx * buffers */ enum dmaengine_alignment { DMAENGINE_ALIGN_1_BYTE = 0, DMAENGINE_ALIGN_2_BYTES = 1, DMAENGINE_ALIGN_4_BYTES = 2, DMAENGINE_ALIGN_8_BYTES = 3, DMAENGINE_ALIGN_16_BYTES = 4, DMAENGINE_ALIGN_32_BYTES = 5, DMAENGINE_ALIGN_64_BYTES = 6, DMAENGINE_ALIGN_128_BYTES = 7, DMAENGINE_ALIGN_256_BYTES = 8, }; /** * struct dma_slave_map - associates slave device and it's slave channel with * parameter to be used by a filter function * @devname: name of the device * @slave: slave channel name * @param: opaque parameter to pass to struct dma_filter.fn */ struct dma_slave_map { const char *devname; const char *slave; void *param; }; /** * struct dma_filter - information for slave device/channel to filter_fn/param * mapping * @fn: filter function callback * @mapcnt: number of slave device/channel in the map * @map: array of channel to filter mapping data */ struct dma_filter { dma_filter_fn fn; int mapcnt; const struct dma_slave_map *map; }; /** * struct dma_device - info on the entity supplying DMA services * @ref: reference is taken and put every time a channel is allocated or freed * @chancnt: how many DMA channels are supported * @privatecnt: how many DMA channels are requested by dma_request_channel * @channels: the list of struct dma_chan * @global_node: list_head for global dma_device_list * @filter: information for device/slave to filter function/param mapping * @cap_mask: one or more dma_capability flags * @desc_metadata_modes: supported metadata modes by the DMA device * @max_xor: maximum number of xor sources, 0 if no capability * @max_pq: maximum number of PQ sources and PQ-continue capability * @copy_align: alignment shift for memcpy operations * @xor_align: alignment shift for xor operations * @pq_align: alignment shift for pq operations * @fill_align: alignment shift for memset operations * @dev_id: unique device ID * @dev: struct device reference for dma mapping api * @owner: owner module (automatically set based on the provided dev) * @chan_ida: unique channel ID * @src_addr_widths: bit mask of src addr widths the device supports * Width is specified in bytes, e.g. for a device supporting * a width of 4 the mask should have BIT(4) set. * @dst_addr_widths: bit mask of dst addr widths the device supports * @directions: bit mask of slave directions the device supports. * Since the enum dma_transfer_direction is not defined as bit flag for * each type, the dma controller should set BIT(<TYPE>) and same * should be checked by controller as well * @min_burst: min burst capability per-transfer * @max_burst: max burst capability per-transfer * @max_sg_burst: max number of SG list entries executed in a single burst * DMA tansaction with no software intervention for reinitialization. * Zero value means unlimited number of entries. * @descriptor_reuse: a submitted transfer can be resubmitted after completion * @residue_granularity: granularity of the transfer residue reported * by tx_status * @device_alloc_chan_resources: allocate resources and return the * number of allocated descriptors * @device_router_config: optional callback for DMA router configuration * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation * @device_prep_dma_xor_val: prepares a xor validation operation * @device_prep_dma_pq: prepares a pq operation * @device_prep_dma_pq_val: prepares a pqzero_sum operation * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, * where the address and size of each segment is located in one entry of * the dma_vec array. * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will * be called after period_len bytes have been transferred. * @device_prep_interleaved_dma: Transfer expression in a generic way. * @device_caps: May be used to override the generic DMA slave capabilities * with per-channel specific ones * @device_config: Pushes a new configuration to a channel, return 0 or an error * code * @device_pause: Pauses any transfer happening on a channel. Returns * 0 or an error code * @device_resume: Resumes any transfer on a channel previously * paused. Returns 0 or an error code * @device_terminate_all: Aborts all transfers on a channel. Returns 0 * or an error code * @device_synchronize: Synchronizes the termination of a transfers to the * current context. * @device_tx_status: poll for transaction completion, the optional * txstate parameter can be supplied with a pointer to get a * struct with auxiliary transfer status information, otherwise the call * will just return a simple status code * @device_issue_pending: push pending transactions to hardware * @device_release: called sometime atfer dma_async_device_unregister() is * called and there are no further references to this structure. This * must be implemented to free resources however many existing drivers * do not and are therefore not safe to unbind while in use. * @dbg_summary_show: optional routine to show contents in debugfs; default code * will be used when this is omitted, but custom code can show extra, * controller specific information. * @dbg_dev_root: the root folder in debugfs for this device */ struct dma_device { struct kref ref; unsigned int chancnt; unsigned int privatecnt; struct list_head channels; struct list_head global_node; struct dma_filter filter; dma_cap_mask_t cap_mask; enum dma_desc_metadata_mode desc_metadata_modes; unsigned short max_xor; unsigned short max_pq; enum dmaengine_alignment copy_align; enum dmaengine_alignment xor_align; enum dmaengine_alignment pq_align; enum dmaengine_alignment fill_align; #define DMA_HAS_PQ_CONTINUE (1 << 15) int dev_id; struct device *dev; struct module *owner; struct ida chan_ida; u32 src_addr_widths; u32 dst_addr_widths; u32 directions; u32 min_burst; u32 max_burst; u32 max_sg_burst; bool descriptor_reuse; enum dma_residue_granularity residue_granularity; int (*device_alloc_chan_resources)(struct dma_chan *chan); int (*device_router_config)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t src, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor)( struct dma_chan *chan, dma_addr_t dst, dma_addr_t *src, unsigned int src_cnt, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_xor_val)( struct dma_chan *chan, dma_addr_t *src, unsigned int src_cnt, size_t len, enum sum_check_flags *result, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_pq)( struct dma_chan *chan, dma_addr_t *dst, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_pq_val)( struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src, unsigned int src_cnt, const unsigned char *scf, size_t len, enum sum_check_flags *pqres, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_memset)( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_memset_sg)( struct dma_chan *chan, struct scatterlist *sg, unsigned int nents, int value, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)( struct dma_chan *chan, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_peripheral_dma_vec)( struct dma_chan *chan, const struct dma_vec *vecs, size_t nents, enum dma_transfer_direction direction, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_slave_sg)( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags, void *context); struct dma_async_tx_descriptor *(*device_prep_dma_cyclic)( struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_transfer_direction direction, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags); void (*device_caps)(struct dma_chan *chan, struct dma_slave_caps *caps); int (*device_config)(struct dma_chan *chan, struct dma_slave_config *config); int (*device_pause)(struct dma_chan *chan); int (*device_resume)(struct dma_chan *chan); int (*device_terminate_all)(struct dma_chan *chan); void (*device_synchronize)(struct dma_chan *chan); enum dma_status (*device_tx_status)(struct dma_chan *chan, dma_cookie_t cookie, struct dma_tx_state *txstate); void (*device_issue_pending)(struct dma_chan *chan); void (*device_release)(struct dma_device *dev); /* debugfs support */ void (*dbg_summary_show)(struct seq_file *s, struct dma_device *dev); struct dentry *dbg_dev_root; }; static inline int dmaengine_slave_config(struct dma_chan *chan, struct dma_slave_config *config) { if (chan->device->device_config) return chan->device->device_config(chan, config); return -ENOSYS; } static inline bool is_slave_direction(enum dma_transfer_direction direction) { return (direction == DMA_MEM_TO_DEV) || (direction == DMA_DEV_TO_MEM) || (direction == DMA_DEV_TO_DEV); } static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( struct dma_chan *chan, dma_addr_t buf, size_t len, enum dma_transfer_direction dir, unsigned long flags) { struct scatterlist sg; sg_init_table(&sg, 1); sg_dma_address(&sg) = buf; sg_dma_len(&sg) = len; if (!chan || !chan->device || !chan->device->device_prep_slave_sg) return NULL; return chan->device->device_prep_slave_sg(chan, &sg, 1, dir, flags, NULL); } /** * dmaengine_prep_peripheral_dma_vec() - Prepare a DMA scatter-gather descriptor * @chan: The channel to be used for this descriptor * @vecs: The array of DMA vectors that should be transferred * @nents: The number of DMA vectors in the array * @dir: Specifies the direction of the data transfer * @flags: DMA engine flags */ static inline struct dma_async_tx_descriptor *dmaengine_prep_peripheral_dma_vec( struct dma_chan *chan, const struct dma_vec *vecs, size_t nents, enum dma_transfer_direction dir, unsigned long flags) { if (!chan || !chan->device || !chan->device->device_prep_peripheral_dma_vec) return NULL; return chan->device->device_prep_peripheral_dma_vec(chan, vecs, nents, dir, flags); } static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_sg( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction dir, unsigned long flags) { if (!chan || !chan->device || !chan->device->device_prep_slave_sg) return NULL; return chan->device->device_prep_slave_sg(chan, sgl, sg_len, dir, flags, NULL); } #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct rio_dma_ext; static inline struct dma_async_tx_descriptor *dmaengine_prep_rio_sg( struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction dir, unsigned long flags, struct rio_dma_ext *rio_ext) { if (!chan || !chan->device || !chan->device->device_prep_slave_sg) return NULL; return chan->device->device_prep_slave_sg(chan, sgl, sg_len, dir, flags, rio_ext); } #endif static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_transfer_direction dir, unsigned long flags) { if (!chan || !chan->device || !chan->device->device_prep_dma_cyclic) return NULL; return chan->device->device_prep_dma_cyclic(chan, buf_addr, buf_len, period_len, dir, flags); } static inline struct dma_async_tx_descriptor *dmaengine_prep_interleaved_dma( struct dma_chan *chan, struct dma_interleaved_template *xt, unsigned long flags) { if (!chan || !chan->device || !chan->device->device_prep_interleaved_dma) return NULL; if (flags & DMA_PREP_REPEAT && !test_bit(DMA_REPEAT, chan->device->cap_mask.bits)) return NULL; return chan->device->device_prep_interleaved_dma(chan, xt, flags); } /** * dmaengine_prep_dma_memset() - Prepare a DMA memset descriptor. * @chan: The channel to be used for this descriptor * @dest: Address of buffer to be set * @value: Treated as a single byte value that fills the destination buffer * @len: The total size of dest * @flags: DMA engine flags */ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memset( struct dma_chan *chan, dma_addr_t dest, int value, size_t len, unsigned long flags) { if (!chan || !chan->device || !chan->device->device_prep_dma_memset) return NULL; return chan->device->device_prep_dma_memset(chan, dest, value, len, flags); } static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy( struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, size_t len, unsigned long flags) { if (!chan || !chan->device || !chan->device->device_prep_dma_memcpy) return NULL; return chan->device->device_prep_dma_memcpy(chan, dest, src, len, flags); } static inline bool dmaengine_is_metadata_mode_supported(struct dma_chan *chan, enum dma_desc_metadata_mode mode) { if (!chan) return false; return !!(chan->device->desc_metadata_modes & mode); } #ifdef CONFIG_DMA_ENGINE int dmaengine_desc_attach_metadata(struct dma_async_tx_descriptor *desc, void *data, size_t len); void *dmaengine_desc_get_metadata_ptr(struct dma_async_tx_descriptor *desc, size_t *payload_len, size_t *max_len); int dmaengine_desc_set_metadata_len(struct dma_async_tx_descriptor *desc, size_t payload_len); #else /* CONFIG_DMA_ENGINE */ static inline int dmaengine_desc_attach_metadata( struct dma_async_tx_descriptor *desc, void *data, size_t len) { return -EINVAL; } static inline void *dmaengine_desc_get_metadata_ptr( struct dma_async_tx_descriptor *desc, size_t *payload_len, size_t *max_len) { return NULL; } static inline int dmaengine_desc_set_metadata_len( struct dma_async_tx_descriptor *desc, size_t payload_len) { return -EINVAL; } #endif /* CONFIG_DMA_ENGINE */ /** * dmaengine_terminate_all() - Terminate all active DMA transfers * @chan: The channel for which to terminate the transfers * * This function is DEPRECATED use either dmaengine_terminate_sync() or * dmaengine_terminate_async() instead. */ static inline int dmaengine_terminate_all(struct dma_chan *chan) { if (chan->device->device_terminate_all) return chan->device->device_terminate_all(chan); return -ENOSYS; } /** * dmaengine_terminate_async() - Terminate all active DMA transfers * @chan: The channel for which to terminate the transfers * * Calling this function will terminate all active and pending descriptors * that have previously been submitted to the channel. It is not guaranteed * though that the transfer for the active descriptor has stopped when the * function returns. Furthermore it is possible the complete callback of a * submitted transfer is still running when this function returns. * * dmaengine_synchronize() needs to be called before it is safe to free * any memory that is accessed by previously submitted descriptors or before * freeing any resources accessed from within the completion callback of any * previously submitted descriptors. * * This function can be called from atomic context as well as from within a * complete callback of a descriptor submitted on the same channel. * * If none of the two conditions above apply consider using * dmaengine_terminate_sync() instead. */ static inline int dmaengine_terminate_async(struct dma_chan *chan) { if (chan->device->device_terminate_all) return chan->device->device_terminate_all(chan); return -EINVAL; } /** * dmaengine_synchronize() - Synchronize DMA channel termination * @chan: The channel to synchronize * * Synchronizes to the DMA channel termination to the current context. When this * function returns it is guaranteed that all transfers for previously issued * descriptors have stopped and it is safe to free the memory associated * with them. Furthermore it is guaranteed that all complete callback functions * for a previously submitted descriptor have finished running and it is safe to * free resources accessed from within the complete callbacks. * * The behavior of this function is undefined if dma_async_issue_pending() has * been called between dmaengine_terminate_async() and this function. * * This function must only be called from non-atomic context and must not be * called from within a complete callback of a descriptor submitted on the same * channel. */ static inline void dmaengine_synchronize(struct dma_chan *chan) { might_sleep(); if (chan->device->device_synchronize) chan->device->device_synchronize(chan); } /** * dmaengine_terminate_sync() - Terminate all active DMA transfers * @chan: The channel for which to terminate the transfers * * Calling this function will terminate all active and pending transfers * that have previously been submitted to the channel. It is similar to * dmaengine_terminate_async() but guarantees that the DMA transfer has actually * stopped and that all complete callbacks have finished running when the * function returns. * * This function must only be called from non-atomic context and must not be * called from within a complete callback of a descriptor submitted on the same * channel. */ static inline int dmaengine_terminate_sync(struct dma_chan *chan) { int ret; ret = dmaengine_terminate_async(chan); if (ret) return ret; dmaengine_synchronize(chan); return 0; } static inline int dmaengine_pause(struct dma_chan *chan) { if (chan->device->device_pause) return chan->device->device_pause(chan); return -ENOSYS; } static inline int dmaengine_resume(struct dma_chan *chan) { if (chan->device->device_resume) return chan->device->device_resume(chan); return -ENOSYS; } static inline enum dma_status dmaengine_tx_status(struct dma_chan *chan, dma_cookie_t cookie, struct dma_tx_state *state) { return chan->device->device_tx_status(chan, cookie, state); } static inline dma_cookie_t dmaengine_submit(struct dma_async_tx_descriptor *desc) { return desc->tx_submit(desc); } static inline bool dmaengine_check_align(enum dmaengine_alignment align, size_t off1, size_t off2, size_t len) { return !(((1 << align) - 1) & (off1 | off2 | len)); } static inline bool is_dma_copy_aligned(struct dma_device *dev, size_t off1, size_t off2, size_t len) { return dmaengine_check_align(dev->copy_align, off1, off2, len); } static inline bool is_dma_xor_aligned(struct dma_device *dev, size_t off1, size_t off2, size_t len) { return dmaengine_check_align(dev->xor_align, off1, off2, len); } static inline bool is_dma_pq_aligned(struct dma_device *dev, size_t off1, size_t off2, size_t len) { return dmaengine_check_align(dev->pq_align, off1, off2, len); } static inline bool is_dma_fill_aligned(struct dma_device *dev, size_t off1, size_t off2, size_t len) { return dmaengine_check_align(dev->fill_align, off1, off2, len); } static inline void dma_set_maxpq(struct dma_device *dma, int maxpq, int has_pq_continue) { dma->max_pq = maxpq; if (has_pq_continue) dma->max_pq |= DMA_HAS_PQ_CONTINUE; } static inline bool dmaf_continue(enum dma_ctrl_flags flags) { return (flags & DMA_PREP_CONTINUE) == DMA_PREP_CONTINUE; } static inline bool dmaf_p_disabled_continue(enum dma_ctrl_flags flags) { enum dma_ctrl_flags mask = DMA_PREP_CONTINUE | DMA_PREP_PQ_DISABLE_P; return (flags & mask) == mask; } static inline bool dma_dev_has_pq_continue(struct dma_device *dma) { return (dma->max_pq & DMA_HAS_PQ_CONTINUE) == DMA_HAS_PQ_CONTINUE; } static inline unsigned short dma_dev_to_maxpq(struct dma_device *dma) { return dma->max_pq & ~DMA_HAS_PQ_CONTINUE; } /* dma_maxpq - reduce maxpq in the face of continued operations * @dma - dma device with PQ capability * @flags - to check if DMA_PREP_CONTINUE and DMA_PREP_PQ_DISABLE_P are set * * When an engine does not support native continuation we need 3 extra * source slots to reuse P and Q with the following coefficients: * 1/ {00} * P : remove P from Q', but use it as a source for P' * 2/ {01} * Q : use Q to continue Q' calculation * 3/ {00} * Q : subtract Q from P' to cancel (2) * * In the case where P is disabled we only need 1 extra source: * 1/ {01} * Q : use Q to continue Q' calculation */ static inline int dma_maxpq(struct dma_device *dma, enum dma_ctrl_flags flags) { if (dma_dev_has_pq_continue(dma) || !dmaf_continue(flags)) return dma_dev_to_maxpq(dma); if (dmaf_p_disabled_continue(flags)) return dma_dev_to_maxpq(dma) - 1; if (dmaf_continue(flags)) return dma_dev_to_maxpq(dma) - 3; BUG(); } static inline size_t dmaengine_get_icg(bool inc, bool sgl, size_t icg, size_t dir_icg) { if (inc) { if (dir_icg) return dir_icg; if (sgl) return icg; } return 0; } static inline size_t dmaengine_get_dst_icg(struct dma_interleaved_template *xt, struct data_chunk *chunk) { return dmaengine_get_icg(xt->dst_inc, xt->dst_sgl, chunk->icg, chunk->dst_icg); } static inline size_t dmaengine_get_src_icg(struct dma_interleaved_template *xt, struct data_chunk *chunk) { return dmaengine_get_icg(xt->src_inc, xt->src_sgl, chunk->icg, chunk->src_icg); } /* --- public DMA engine API --- */ #ifdef CONFIG_DMA_ENGINE void dmaengine_get(void); void dmaengine_put(void); #else static inline void dmaengine_get(void) { } static inline void dmaengine_put(void) { } #endif #ifdef CONFIG_ASYNC_TX_DMA #define async_dmaengine_get() dmaengine_get() #define async_dmaengine_put() dmaengine_put() #ifndef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH #define async_dma_find_channel(type) dma_find_channel(DMA_ASYNC_TX) #else #define async_dma_find_channel(type) dma_find_channel(type) #endif /* CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH */ #else static inline void async_dmaengine_get(void) { } static inline void async_dmaengine_put(void) { } static inline struct dma_chan * async_dma_find_channel(enum dma_transaction_type type) { return NULL; } #endif /* CONFIG_ASYNC_TX_DMA */ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan); static inline void async_tx_ack(struct dma_async_tx_descriptor *tx) { tx->flags |= DMA_CTRL_ACK; } static inline void async_tx_clear_ack(struct dma_async_tx_descriptor *tx) { tx->flags &= ~DMA_CTRL_ACK; } static inline bool async_tx_test_ack(struct dma_async_tx_descriptor *tx) { return (tx->flags & DMA_CTRL_ACK) == DMA_CTRL_ACK; } #define dma_cap_set(tx, mask) __dma_cap_set((tx), &(mask)) static inline void __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) { set_bit(tx_type, dstp->bits); } #define dma_cap_clear(tx, mask) __dma_cap_clear((tx), &(mask)) static inline void __dma_cap_clear(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) { clear_bit(tx_type, dstp->bits); } #define dma_cap_zero(mask) __dma_cap_zero(&(mask)) static inline void __dma_cap_zero(dma_cap_mask_t *dstp) { bitmap_zero(dstp->bits, DMA_TX_TYPE_END); } #define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask)) static inline int __dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp) { return test_bit(tx_type, srcp->bits); } #define for_each_dma_cap_mask(cap, mask) \ for_each_set_bit(cap, mask.bits, DMA_TX_TYPE_END) /** * dma_async_issue_pending - flush pending transactions to HW * @chan: target DMA channel * * This allows drivers to push copies to HW in batches, * reducing MMIO writes where possible. */ static inline void dma_async_issue_pending(struct dma_chan *chan) { chan->device->device_issue_pending(chan); } /** * dma_async_is_tx_complete - poll for transaction completion * @chan: DMA channel * @cookie: transaction identifier to check status of * @last: returns last completed cookie, can be NULL * @used: returns last issued cookie, can be NULL * * If @last and @used are passed in, upon return they reflect the driver * internal state and can be used with dma_async_is_complete() to check * the status of multiple cookies without re-checking hardware state. */ static inline enum dma_status dma_async_is_tx_complete(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used) { struct dma_tx_state state; enum dma_status status; status = chan->device->device_tx_status(chan, cookie, &state); if (last) *last = state.last; if (used) *used = state.used; return status; } /** * dma_async_is_complete - test a cookie against chan state * @cookie: transaction identifier to test status of * @last_complete: last know completed transaction * @last_used: last cookie value handed out * * dma_async_is_complete() is used in dma_async_is_tx_complete() * the test logic is separated for lightweight testing of multiple cookies */ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, dma_cookie_t last_complete, dma_cookie_t last_used) { if (last_complete <= last_used) { if ((cookie <= last_complete) || (cookie > last_used)) return DMA_COMPLETE; } else { if ((cookie <= last_complete) && (cookie > last_used)) return DMA_COMPLETE; } return DMA_IN_PROGRESS; } static inline void dma_set_tx_state(struct dma_tx_state *st, dma_cookie_t last, dma_cookie_t used, u32 residue) { if (!st) return; st->last = last; st->used = used; st->residue = residue; } #ifdef CONFIG_DMA_ENGINE struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); void dma_issue_pending_all(void); struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param, struct device_node *np); struct dma_chan *dma_request_chan(struct device *dev, const char *name); struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask); void dma_release_channel(struct dma_chan *chan); int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps); #else static inline struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) { return NULL; } static inline enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) { return DMA_COMPLETE; } static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) { return DMA_COMPLETE; } static inline void dma_issue_pending_all(void) { } static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param, struct device_node *np) { return NULL; } static inline struct dma_chan *dma_request_chan(struct device *dev, const char *name) { return ERR_PTR(-ENODEV); } static inline struct dma_chan *dma_request_chan_by_mask( const dma_cap_mask_t *mask) { return ERR_PTR(-ENODEV); } static inline void dma_release_channel(struct dma_chan *chan) { } static inline int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps) { return -ENXIO; } #endif static inline int dmaengine_desc_set_reuse(struct dma_async_tx_descriptor *tx) { struct dma_slave_caps caps; int ret; ret = dma_get_slave_caps(tx->chan, &caps); if (ret) return ret; if (!caps.descriptor_reuse) return -EPERM; tx->flags |= DMA_CTRL_REUSE; return 0; } static inline void dmaengine_desc_clear_reuse(struct dma_async_tx_descriptor *tx) { tx->flags &= ~DMA_CTRL_REUSE; } static inline bool dmaengine_desc_test_reuse(struct dma_async_tx_descriptor *tx) { return (tx->flags & DMA_CTRL_REUSE) == DMA_CTRL_REUSE; } static inline int dmaengine_desc_free(struct dma_async_tx_descriptor *desc) { /* this is supported for reusable desc, so check that */ if (!dmaengine_desc_test_reuse(desc)) return -EPERM; return desc->desc_free(desc); } /* --- DMA device --- */ int dma_async_device_register(struct dma_device *device); int dmaenginem_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); int dma_async_device_channel_register(struct dma_device *device, struct dma_chan *chan, const char *name); void dma_async_device_channel_unregister(struct dma_device *device, struct dma_chan *chan); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); #define dma_request_channel(mask, x, y) \ __dma_request_channel(&(mask), x, y, NULL) /* Deprecated, please use dma_request_chan() directly */ static inline struct dma_chan * __deprecated dma_request_slave_channel(struct device *dev, const char *name) { struct dma_chan *ch = dma_request_chan(dev, name); return IS_ERR(ch) ? NULL : ch; } static inline struct dma_chan *dma_request_slave_channel_compat(const dma_cap_mask_t mask, dma_filter_fn fn, void *fn_param, struct device *dev, const char *name) { struct dma_chan *chan; chan = dma_request_chan(dev, name); if (!IS_ERR(chan)) return chan; if (!fn || !fn_param) return NULL; return dma_request_channel(mask, fn, fn_param); } static inline char * dmaengine_get_direction_text(enum dma_transfer_direction dir) { switch (dir) { case DMA_DEV_TO_MEM: return "DEV_TO_MEM"; case DMA_MEM_TO_DEV: return "MEM_TO_DEV"; case DMA_MEM_TO_MEM: return "MEM_TO_MEM"; case DMA_DEV_TO_DEV: return "DEV_TO_DEV"; default: return "invalid"; } } static inline struct device *dmaengine_get_dma_device(struct dma_chan *chan) { if (chan->dev->chan_dma_dev) return &chan->dev->device; return chan->device->dev; } #endif /* DMAENGINE_H */ |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 | /* * edac_mc kernel module * (C) 2005, 2006 Linux Networx (http://lnxi.com) * This file may be distributed under the terms of the * GNU General Public License. * * Written by Thayne Harbaugh * Based on work by Dan Hollis <goemon at anime dot net> and others. * http://www.anime.net/~goemon/linux-ecc/ * * Modified by Dave Peterson and Doug Thompson * */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/smp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/highmem.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/ctype.h> #include <linux/edac.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <asm/page.h> #include "edac_mc.h" #include "edac_module.h" #include <ras/ras_event.h> #ifdef CONFIG_EDAC_ATOMIC_SCRUB #include <asm/edac.h> #else #define edac_atomic_scrub(va, size) do { } while (0) #endif int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); /* * Used to lock EDAC MC to just one module, avoiding two drivers e. g. * apei/ghes and i7core_edac to be used at the same time. */ static const char *edac_mc_owner; static struct mem_ctl_info *error_desc_to_mci(struct edac_raw_error_desc *e) { return container_of(e, struct mem_ctl_info, error_desc); } unsigned int edac_dimm_info_location(struct dimm_info *dimm, char *buf, unsigned int len) { struct mem_ctl_info *mci = dimm->mci; int i, n, count = 0; char *p = buf; for (i = 0; i < mci->n_layers; i++) { n = scnprintf(p, len, "%s %d ", edac_layer_name[mci->layers[i].type], dimm->location[i]); p += n; len -= n; count += n; } return count; } #ifdef CONFIG_EDAC_DEBUG static void edac_mc_dump_channel(struct rank_info *chan) { edac_dbg(4, " channel->chan_idx = %d\n", chan->chan_idx); edac_dbg(4, " channel = %p\n", chan); edac_dbg(4, " channel->csrow = %p\n", chan->csrow); edac_dbg(4, " channel->dimm = %p\n", chan->dimm); } static void edac_mc_dump_dimm(struct dimm_info *dimm) { char location[80]; if (!dimm->nr_pages) return; edac_dimm_info_location(dimm, location, sizeof(location)); edac_dbg(4, "%s%i: %smapped as virtual row %d, chan %d\n", dimm->mci->csbased ? "rank" : "dimm", dimm->idx, location, dimm->csrow, dimm->cschannel); edac_dbg(4, " dimm = %p\n", dimm); edac_dbg(4, " dimm->label = '%s'\n", dimm->label); edac_dbg(4, " dimm->nr_pages = 0x%x\n", dimm->nr_pages); edac_dbg(4, " dimm->grain = %d\n", dimm->grain); } static void edac_mc_dump_csrow(struct csrow_info *csrow) { edac_dbg(4, "csrow->csrow_idx = %d\n", csrow->csrow_idx); edac_dbg(4, " csrow = %p\n", csrow); edac_dbg(4, " csrow->first_page = 0x%lx\n", csrow->first_page); edac_dbg(4, " csrow->last_page = 0x%lx\n", csrow->last_page); edac_dbg(4, " csrow->page_mask = 0x%lx\n", csrow->page_mask); edac_dbg(4, " csrow->nr_channels = %d\n", csrow->nr_channels); edac_dbg(4, " csrow->channels = %p\n", csrow->channels); edac_dbg(4, " csrow->mci = %p\n", csrow->mci); } static void edac_mc_dump_mci(struct mem_ctl_info *mci) { edac_dbg(3, "\tmci = %p\n", mci); edac_dbg(3, "\tmci->mtype_cap = %lx\n", mci->mtype_cap); edac_dbg(3, "\tmci->edac_ctl_cap = %lx\n", mci->edac_ctl_cap); edac_dbg(3, "\tmci->edac_cap = %lx\n", mci->edac_cap); edac_dbg(4, "\tmci->edac_check = %p\n", mci->edac_check); edac_dbg(3, "\tmci->nr_csrows = %d, csrows = %p\n", mci->nr_csrows, mci->csrows); edac_dbg(3, "\tmci->nr_dimms = %d, dimms = %p\n", mci->tot_dimms, mci->dimms); edac_dbg(3, "\tdev = %p\n", mci->pdev); edac_dbg(3, "\tmod_name:ctl_name = %s:%s\n", mci->mod_name, mci->ctl_name); edac_dbg(3, "\tpvt_info = %p\n\n", mci->pvt_info); } #endif /* CONFIG_EDAC_DEBUG */ const char * const edac_mem_types[] = { [MEM_EMPTY] = "Empty", [MEM_RESERVED] = "Reserved", [MEM_UNKNOWN] = "Unknown", [MEM_FPM] = "FPM", [MEM_EDO] = "EDO", [MEM_BEDO] = "BEDO", [MEM_SDR] = "Unbuffered-SDR", [MEM_RDR] = "Registered-SDR", [MEM_DDR] = "Unbuffered-DDR", [MEM_RDDR] = "Registered-DDR", [MEM_RMBS] = "RMBS", [MEM_DDR2] = "Unbuffered-DDR2", [MEM_FB_DDR2] = "FullyBuffered-DDR2", [MEM_RDDR2] = "Registered-DDR2", [MEM_XDR] = "XDR", [MEM_DDR3] = "Unbuffered-DDR3", [MEM_RDDR3] = "Registered-DDR3", [MEM_LRDDR3] = "Load-Reduced-DDR3-RAM", [MEM_LPDDR3] = "Low-Power-DDR3-RAM", [MEM_DDR4] = "Unbuffered-DDR4", [MEM_RDDR4] = "Registered-DDR4", [MEM_LPDDR4] = "Low-Power-DDR4-RAM", [MEM_LRDDR4] = "Load-Reduced-DDR4-RAM", [MEM_DDR5] = "Unbuffered-DDR5", [MEM_RDDR5] = "Registered-DDR5", [MEM_LRDDR5] = "Load-Reduced-DDR5-RAM", [MEM_NVDIMM] = "Non-volatile-RAM", [MEM_WIO2] = "Wide-IO-2", [MEM_HBM2] = "High-bandwidth-memory-Gen2", [MEM_HBM3] = "High-bandwidth-memory-Gen3", }; EXPORT_SYMBOL_GPL(edac_mem_types); static void _edac_mc_free(struct mem_ctl_info *mci) { put_device(&mci->dev); } static void mci_release(struct device *dev) { struct mem_ctl_info *mci = container_of(dev, struct mem_ctl_info, dev); struct csrow_info *csr; int i, chn, row; if (mci->dimms) { for (i = 0; i < mci->tot_dimms; i++) kfree(mci->dimms[i]); kfree(mci->dimms); } if (mci->csrows) { for (row = 0; row < mci->nr_csrows; row++) { csr = mci->csrows[row]; if (!csr) continue; if (csr->channels) { for (chn = 0; chn < mci->num_cschannel; chn++) kfree(csr->channels[chn]); kfree(csr->channels); } kfree(csr); } kfree(mci->csrows); } kfree(mci->pvt_info); kfree(mci->layers); kfree(mci); } static int edac_mc_alloc_csrows(struct mem_ctl_info *mci) { unsigned int tot_channels = mci->num_cschannel; unsigned int tot_csrows = mci->nr_csrows; unsigned int row, chn; /* * Allocate and fill the csrow/channels structs */ mci->csrows = kcalloc(tot_csrows, sizeof(*mci->csrows), GFP_KERNEL); if (!mci->csrows) return -ENOMEM; for (row = 0; row < tot_csrows; row++) { struct csrow_info *csr; csr = kzalloc(sizeof(**mci->csrows), GFP_KERNEL); if (!csr) return -ENOMEM; mci->csrows[row] = csr; csr->csrow_idx = row; csr->mci = mci; csr->nr_channels = tot_channels; csr->channels = kcalloc(tot_channels, sizeof(*csr->channels), GFP_KERNEL); if (!csr->channels) return -ENOMEM; for (chn = 0; chn < tot_channels; chn++) { struct rank_info *chan; chan = kzalloc(sizeof(**csr->channels), GFP_KERNEL); if (!chan) return -ENOMEM; csr->channels[chn] = chan; chan->chan_idx = chn; chan->csrow = csr; } } return 0; } static int edac_mc_alloc_dimms(struct mem_ctl_info *mci) { unsigned int pos[EDAC_MAX_LAYERS]; unsigned int row, chn, idx; int layer; void *p; /* * Allocate and fill the dimm structs */ mci->dimms = kcalloc(mci->tot_dimms, sizeof(*mci->dimms), GFP_KERNEL); if (!mci->dimms) return -ENOMEM; memset(&pos, 0, sizeof(pos)); row = 0; chn = 0; for (idx = 0; idx < mci->tot_dimms; idx++) { struct dimm_info *dimm; struct rank_info *chan; int n, len; chan = mci->csrows[row]->channels[chn]; dimm = kzalloc(sizeof(**mci->dimms), GFP_KERNEL); if (!dimm) return -ENOMEM; mci->dimms[idx] = dimm; dimm->mci = mci; dimm->idx = idx; /* * Copy DIMM location and initialize it. */ len = sizeof(dimm->label); p = dimm->label; n = scnprintf(p, len, "mc#%u", mci->mc_idx); p += n; len -= n; for (layer = 0; layer < mci->n_layers; layer++) { n = scnprintf(p, len, "%s#%u", edac_layer_name[mci->layers[layer].type], pos[layer]); p += n; len -= n; dimm->location[layer] = pos[layer]; } /* Link it to the csrows old API data */ chan->dimm = dimm; dimm->csrow = row; dimm->cschannel = chn; /* Increment csrow location */ if (mci->layers[0].is_virt_csrow) { chn++; if (chn == mci->num_cschannel) { chn = 0; row++; } } else { row++; if (row == mci->nr_csrows) { row = 0; chn++; } } /* Increment dimm location */ for (layer = mci->n_layers - 1; layer >= 0; layer--) { pos[layer]++; if (pos[layer] < mci->layers[layer].size) break; pos[layer] = 0; } } return 0; } struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, unsigned int n_layers, struct edac_mc_layer *layers, unsigned int sz_pvt) { struct mem_ctl_info *mci; struct edac_mc_layer *layer; unsigned int idx, tot_dimms = 1; unsigned int tot_csrows = 1, tot_channels = 1; bool per_rank = false; if (WARN_ON(n_layers > EDAC_MAX_LAYERS || n_layers == 0)) return NULL; /* * Calculate the total amount of dimms and csrows/cschannels while * in the old API emulation mode */ for (idx = 0; idx < n_layers; idx++) { tot_dimms *= layers[idx].size; if (layers[idx].is_virt_csrow) tot_csrows *= layers[idx].size; else tot_channels *= layers[idx].size; if (layers[idx].type == EDAC_MC_LAYER_CHIP_SELECT) per_rank = true; } mci = kzalloc(sizeof(struct mem_ctl_info), GFP_KERNEL); if (!mci) return NULL; mci->layers = kcalloc(n_layers, sizeof(struct edac_mc_layer), GFP_KERNEL); if (!mci->layers) goto error; mci->pvt_info = kzalloc(sz_pvt, GFP_KERNEL); if (!mci->pvt_info) goto error; mci->dev.release = mci_release; device_initialize(&mci->dev); /* setup index and various internal pointers */ mci->mc_idx = mc_num; mci->tot_dimms = tot_dimms; mci->n_layers = n_layers; memcpy(mci->layers, layers, sizeof(*layer) * n_layers); mci->nr_csrows = tot_csrows; mci->num_cschannel = tot_channels; mci->csbased = per_rank; if (edac_mc_alloc_csrows(mci)) goto error; if (edac_mc_alloc_dimms(mci)) goto error; mci->op_state = OP_ALLOC; return mci; error: _edac_mc_free(mci); return NULL; } EXPORT_SYMBOL_GPL(edac_mc_alloc); void edac_mc_free(struct mem_ctl_info *mci) { edac_dbg(1, "\n"); _edac_mc_free(mci); } EXPORT_SYMBOL_GPL(edac_mc_free); bool edac_has_mcs(void) { bool ret; mutex_lock(&mem_ctls_mutex); ret = list_empty(&mc_devices); mutex_unlock(&mem_ctls_mutex); return !ret; } EXPORT_SYMBOL_GPL(edac_has_mcs); /* Caller must hold mem_ctls_mutex */ static struct mem_ctl_info *__find_mci_by_dev(struct device *dev) { struct mem_ctl_info *mci; struct list_head *item; edac_dbg(3, "\n"); list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); if (mci->pdev == dev) return mci; } return NULL; } /** * find_mci_by_dev * * scan list of controllers looking for the one that manages * the 'dev' device * @dev: pointer to a struct device related with the MCI */ struct mem_ctl_info *find_mci_by_dev(struct device *dev) { struct mem_ctl_info *ret; mutex_lock(&mem_ctls_mutex); ret = __find_mci_by_dev(dev); mutex_unlock(&mem_ctls_mutex); return ret; } EXPORT_SYMBOL_GPL(find_mci_by_dev); /* * edac_mc_workq_function * performs the operation scheduled by a workq request */ static void edac_mc_workq_function(struct work_struct *work_req) { struct delayed_work *d_work = to_delayed_work(work_req); struct mem_ctl_info *mci = to_edac_mem_ctl_work(d_work); mutex_lock(&mem_ctls_mutex); if (mci->op_state != OP_RUNNING_POLL) { mutex_unlock(&mem_ctls_mutex); return; } if (edac_op_state == EDAC_OPSTATE_POLL) mci->edac_check(mci); mutex_unlock(&mem_ctls_mutex); /* Queue ourselves again. */ edac_queue_work(&mci->work, msecs_to_jiffies(edac_mc_get_poll_msec())); } /* * edac_mc_reset_delay_period(unsigned long value) * * user space has updated our poll period value, need to * reset our workq delays */ void edac_mc_reset_delay_period(unsigned long value) { struct mem_ctl_info *mci; struct list_head *item; mutex_lock(&mem_ctls_mutex); list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); if (mci->op_state == OP_RUNNING_POLL) edac_mod_work(&mci->work, value); } mutex_unlock(&mem_ctls_mutex); } /* Return 0 on success, 1 on failure. * Before calling this function, caller must * assign a unique value to mci->mc_idx. * * locking model: * * called with the mem_ctls_mutex lock held */ static int add_mc_to_global_list(struct mem_ctl_info *mci) { struct list_head *item, *insert_before; struct mem_ctl_info *p; insert_before = &mc_devices; p = __find_mci_by_dev(mci->pdev); if (unlikely(p != NULL)) goto fail0; list_for_each(item, &mc_devices) { p = list_entry(item, struct mem_ctl_info, link); if (p->mc_idx >= mci->mc_idx) { if (unlikely(p->mc_idx == mci->mc_idx)) goto fail1; insert_before = item; break; } } list_add_tail_rcu(&mci->link, insert_before); return 0; fail0: edac_printk(KERN_WARNING, EDAC_MC, "%s (%s) %s %s already assigned %d\n", dev_name(p->pdev), edac_dev_name(mci), p->mod_name, p->ctl_name, p->mc_idx); return 1; fail1: edac_printk(KERN_WARNING, EDAC_MC, "bug in low-level driver: attempt to assign\n" " duplicate mc_idx %d in %s()\n", p->mc_idx, __func__); return 1; } static int del_mc_from_global_list(struct mem_ctl_info *mci) { list_del_rcu(&mci->link); /* these are for safe removal of devices from global list while * NMI handlers may be traversing list */ synchronize_rcu(); INIT_LIST_HEAD(&mci->link); return list_empty(&mc_devices); } struct mem_ctl_info *edac_mc_find(int idx) { struct mem_ctl_info *mci; struct list_head *item; mutex_lock(&mem_ctls_mutex); list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); if (mci->mc_idx == idx) goto unlock; } mci = NULL; unlock: mutex_unlock(&mem_ctls_mutex); return mci; } EXPORT_SYMBOL(edac_mc_find); const char *edac_get_owner(void) { return edac_mc_owner; } EXPORT_SYMBOL_GPL(edac_get_owner); /* FIXME - should a warning be printed if no error detection? correction? */ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, const struct attribute_group **groups) { int ret = -EINVAL; edac_dbg(0, "\n"); #ifdef CONFIG_EDAC_DEBUG if (edac_debug_level >= 3) edac_mc_dump_mci(mci); if (edac_debug_level >= 4) { struct dimm_info *dimm; int i; for (i = 0; i < mci->nr_csrows; i++) { struct csrow_info *csrow = mci->csrows[i]; u32 nr_pages = 0; int j; for (j = 0; j < csrow->nr_channels; j++) nr_pages += csrow->channels[j]->dimm->nr_pages; if (!nr_pages) continue; edac_mc_dump_csrow(csrow); for (j = 0; j < csrow->nr_channels; j++) if (csrow->channels[j]->dimm->nr_pages) edac_mc_dump_channel(csrow->channels[j]); } mci_for_each_dimm(mci, dimm) edac_mc_dump_dimm(dimm); } #endif mutex_lock(&mem_ctls_mutex); if (edac_mc_owner && edac_mc_owner != mci->mod_name) { ret = -EPERM; goto fail0; } if (add_mc_to_global_list(mci)) goto fail0; /* set load time so that error rate can be tracked */ mci->start_time = jiffies; mci->bus = edac_get_sysfs_subsys(); if (edac_create_sysfs_mci_device(mci, groups)) { edac_mc_printk(mci, KERN_WARNING, "failed to create sysfs device\n"); goto fail1; } if (mci->edac_check) { mci->op_state = OP_RUNNING_POLL; INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); edac_queue_work(&mci->work, msecs_to_jiffies(edac_mc_get_poll_msec())); } else { mci->op_state = OP_RUNNING_INTERRUPT; } /* Report action taken */ edac_mc_printk(mci, KERN_INFO, "Giving out device to module %s controller %s: DEV %s (%s)\n", mci->mod_name, mci->ctl_name, mci->dev_name, edac_op_state_to_string(mci->op_state)); edac_mc_owner = mci->mod_name; mutex_unlock(&mem_ctls_mutex); return 0; fail1: del_mc_from_global_list(mci); fail0: mutex_unlock(&mem_ctls_mutex); return ret; } EXPORT_SYMBOL_GPL(edac_mc_add_mc_with_groups); struct mem_ctl_info *edac_mc_del_mc(struct device *dev) { struct mem_ctl_info *mci; edac_dbg(0, "\n"); mutex_lock(&mem_ctls_mutex); /* find the requested mci struct in the global list */ mci = __find_mci_by_dev(dev); if (mci == NULL) { mutex_unlock(&mem_ctls_mutex); return NULL; } /* mark MCI offline: */ mci->op_state = OP_OFFLINE; if (del_mc_from_global_list(mci)) edac_mc_owner = NULL; mutex_unlock(&mem_ctls_mutex); if (mci->edac_check) edac_stop_work(&mci->work); /* remove from sysfs */ edac_remove_sysfs_mci_device(mci); edac_printk(KERN_INFO, EDAC_MC, "Removed device %d for %s %s: DEV %s\n", mci->mc_idx, mci->mod_name, mci->ctl_name, edac_dev_name(mci)); return mci; } EXPORT_SYMBOL_GPL(edac_mc_del_mc); static void edac_mc_scrub_block(unsigned long page, unsigned long offset, u32 size) { struct page *pg; void *virt_addr; unsigned long flags = 0; edac_dbg(3, "\n"); /* ECC error page was not in our memory. Ignore it. */ if (!pfn_valid(page)) return; /* Find the actual page structure then map it and fix */ pg = pfn_to_page(page); if (PageHighMem(pg)) local_irq_save(flags); virt_addr = kmap_atomic(pg); /* Perform architecture specific atomic scrub operation */ edac_atomic_scrub(virt_addr + offset, size); /* Unmap and complete */ kunmap_atomic(virt_addr); if (PageHighMem(pg)) local_irq_restore(flags); } /* FIXME - should return -1 */ int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, unsigned long page) { struct csrow_info **csrows = mci->csrows; int row, i, j, n; edac_dbg(1, "MC%d: 0x%lx\n", mci->mc_idx, page); row = -1; for (i = 0; i < mci->nr_csrows; i++) { struct csrow_info *csrow = csrows[i]; n = 0; for (j = 0; j < csrow->nr_channels; j++) { struct dimm_info *dimm = csrow->channels[j]->dimm; n += dimm->nr_pages; } if (n == 0) continue; edac_dbg(3, "MC%d: first(0x%lx) page(0x%lx) last(0x%lx) mask(0x%lx)\n", mci->mc_idx, csrow->first_page, page, csrow->last_page, csrow->page_mask); if ((page >= csrow->first_page) && (page <= csrow->last_page) && ((page & csrow->page_mask) == (csrow->first_page & csrow->page_mask))) { row = i; break; } } if (row == -1) edac_mc_printk(mci, KERN_ERR, "could not look up page error address %lx\n", (unsigned long)page); return row; } EXPORT_SYMBOL_GPL(edac_mc_find_csrow_by_page); const char *edac_layer_name[] = { [EDAC_MC_LAYER_BRANCH] = "branch", [EDAC_MC_LAYER_CHANNEL] = "channel", [EDAC_MC_LAYER_SLOT] = "slot", [EDAC_MC_LAYER_CHIP_SELECT] = "csrow", [EDAC_MC_LAYER_ALL_MEM] = "memory", }; EXPORT_SYMBOL_GPL(edac_layer_name); static void edac_inc_ce_error(struct edac_raw_error_desc *e) { int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; struct mem_ctl_info *mci = error_desc_to_mci(e); struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]); mci->ce_mc += e->error_count; if (dimm) dimm->ce_count += e->error_count; else mci->ce_noinfo_count += e->error_count; } static void edac_inc_ue_error(struct edac_raw_error_desc *e) { int pos[EDAC_MAX_LAYERS] = { e->top_layer, e->mid_layer, e->low_layer }; struct mem_ctl_info *mci = error_desc_to_mci(e); struct dimm_info *dimm = edac_get_dimm(mci, pos[0], pos[1], pos[2]); mci->ue_mc += e->error_count; if (dimm) dimm->ue_count += e->error_count; else mci->ue_noinfo_count += e->error_count; } static void edac_ce_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); unsigned long remapped_page; if (edac_mc_get_log_ce()) { edac_mc_printk(mci, KERN_WARNING, "%d CE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx%s%s)\n", e->error_count, e->msg, *e->msg ? " " : "", e->label, e->location, e->page_frame_number, e->offset_in_page, e->grain, e->syndrome, *e->other_detail ? " - " : "", e->other_detail); } edac_inc_ce_error(e); if (mci->scrub_mode == SCRUB_SW_SRC) { /* * Some memory controllers (called MCs below) can remap * memory so that it is still available at a different * address when PCI devices map into memory. * MC's that can't do this, lose the memory where PCI * devices are mapped. This mapping is MC-dependent * and so we call back into the MC driver for it to * map the MC page to a physical (CPU) page which can * then be mapped to a virtual page - which can then * be scrubbed. */ remapped_page = mci->ctl_page_to_phys ? mci->ctl_page_to_phys(mci, e->page_frame_number) : e->page_frame_number; edac_mc_scrub_block(remapped_page, e->offset_in_page, e->grain); } } static void edac_ue_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); if (edac_mc_get_log_ue()) { edac_mc_printk(mci, KERN_WARNING, "%d UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", e->error_count, e->msg, *e->msg ? " " : "", e->label, e->location, e->page_frame_number, e->offset_in_page, e->grain, *e->other_detail ? " - " : "", e->other_detail); } edac_inc_ue_error(e); if (edac_mc_get_panic_on_ue()) { panic("UE %s%son %s (%s page:0x%lx offset:0x%lx grain:%ld%s%s)\n", e->msg, *e->msg ? " " : "", e->label, e->location, e->page_frame_number, e->offset_in_page, e->grain, *e->other_detail ? " - " : "", e->other_detail); } } static void edac_inc_csrow(struct edac_raw_error_desc *e, int row, int chan) { struct mem_ctl_info *mci = error_desc_to_mci(e); enum hw_event_mc_err_type type = e->type; u16 count = e->error_count; if (row < 0) return; edac_dbg(4, "csrow/channel to increment: (%d,%d)\n", row, chan); if (type == HW_EVENT_ERR_CORRECTED) { mci->csrows[row]->ce_count += count; if (chan >= 0) mci->csrows[row]->channels[chan]->ce_count += count; } else { mci->csrows[row]->ue_count += count; } } void edac_raw_mc_handle_error(struct edac_raw_error_desc *e) { struct mem_ctl_info *mci = error_desc_to_mci(e); u8 grain_bits; /* Sanity-check driver-supplied grain value. */ if (WARN_ON_ONCE(!e->grain)) e->grain = 1; grain_bits = fls_long(e->grain - 1); /* Report the error via the trace interface */ if (IS_ENABLED(CONFIG_RAS)) trace_mc_event(e->type, e->msg, e->label, e->error_count, mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page, grain_bits, e->syndrome, e->other_detail); if (e->type == HW_EVENT_ERR_CORRECTED) edac_ce_error(e); else edac_ue_error(e); } EXPORT_SYMBOL_GPL(edac_raw_mc_handle_error); void edac_mc_handle_error(const enum hw_event_mc_err_type type, struct mem_ctl_info *mci, const u16 error_count, const unsigned long page_frame_number, const unsigned long offset_in_page, const unsigned long syndrome, const int top_layer, const int mid_layer, const int low_layer, const char *msg, const char *other_detail) { struct dimm_info *dimm; char *p, *end; int row = -1, chan = -1; int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer }; int i, n_labels = 0; struct edac_raw_error_desc *e = &mci->error_desc; bool any_memory = true; const char *prefix; edac_dbg(3, "MC%d\n", mci->mc_idx); /* Fills the error report buffer */ memset(e, 0, sizeof (*e)); e->error_count = error_count; e->type = type; e->top_layer = top_layer; e->mid_layer = mid_layer; e->low_layer = low_layer; e->page_frame_number = page_frame_number; e->offset_in_page = offset_in_page; e->syndrome = syndrome; /* need valid strings here for both: */ e->msg = msg ?: ""; e->other_detail = other_detail ?: ""; /* * Check if the event report is consistent and if the memory location is * known. If it is, the DIMM(s) label info will be filled and the DIMM's * error counters will be incremented. */ for (i = 0; i < mci->n_layers; i++) { if (pos[i] >= (int)mci->layers[i].size) { edac_mc_printk(mci, KERN_ERR, "INTERNAL ERROR: %s value is out of range (%d >= %d)\n", edac_layer_name[mci->layers[i].type], pos[i], mci->layers[i].size); /* * Instead of just returning it, let's use what's * known about the error. The increment routines and * the DIMM filter logic will do the right thing by * pointing the likely damaged DIMMs. */ pos[i] = -1; } if (pos[i] >= 0) any_memory = false; } /* * Get the dimm label/grain that applies to the match criteria. * As the error algorithm may not be able to point to just one memory * stick, the logic here will get all possible labels that could * pottentially be affected by the error. * On FB-DIMM memory controllers, for uncorrected errors, it is common * to have only the MC channel and the MC dimm (also called "branch") * but the channel is not known, as the memory is arranged in pairs, * where each memory belongs to a separate channel within the same * branch. */ p = e->label; *p = '\0'; end = p + sizeof(e->label); prefix = ""; mci_for_each_dimm(mci, dimm) { if (top_layer >= 0 && top_layer != dimm->location[0]) continue; if (mid_layer >= 0 && mid_layer != dimm->location[1]) continue; if (low_layer >= 0 && low_layer != dimm->location[2]) continue; /* get the max grain, over the error match range */ if (dimm->grain > e->grain) e->grain = dimm->grain; /* * If the error is memory-controller wide, there's no need to * seek for the affected DIMMs because the whole channel/memory * controller/... may be affected. Also, don't show errors for * empty DIMM slots. */ if (!dimm->nr_pages) continue; n_labels++; if (n_labels > EDAC_MAX_LABELS) { p = e->label; *p = '\0'; } else { p += scnprintf(p, end - p, "%s%s", prefix, dimm->label); prefix = OTHER_LABEL; } /* * get csrow/channel of the DIMM, in order to allow * incrementing the compat API counters */ edac_dbg(4, "%s csrows map: (%d,%d)\n", mci->csbased ? "rank" : "dimm", dimm->csrow, dimm->cschannel); if (row == -1) row = dimm->csrow; else if (row >= 0 && row != dimm->csrow) row = -2; if (chan == -1) chan = dimm->cschannel; else if (chan >= 0 && chan != dimm->cschannel) chan = -2; } if (any_memory) strscpy(e->label, "any memory", sizeof(e->label)); else if (!*e->label) strscpy(e->label, "unknown memory", sizeof(e->label)); edac_inc_csrow(e, row, chan); /* Fill the RAM location data */ p = e->location; end = p + sizeof(e->location); prefix = ""; for (i = 0; i < mci->n_layers; i++) { if (pos[i] < 0) continue; p += scnprintf(p, end - p, "%s%s:%d", prefix, edac_layer_name[mci->layers[i].type], pos[i]); prefix = " "; } edac_raw_mc_handle_error(e); } EXPORT_SYMBOL_GPL(edac_mc_handle_error); |
| 9 9 9 9 2 2 1 1 11 11 4 3 2 2 2 9 9 7 8 8 8 8 8 8 7 7 7 7 8 10 1 1 10 9 1 8 8 8 8 8 8 8 1 1 2 2 2 2 2 2 2 2 2 8 8 8 6 1 9 9 9 9 9 9 8 8 8 9 6 9 9 9 9 9 9 9 9 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/phy.h> #include "netlink.h" #include "common.h" struct strset_info { bool per_dev; bool free_strings; unsigned int count; const char (*strings)[ETH_GSTRING_LEN]; }; static const struct strset_info info_template[] = { [ETH_SS_TEST] = { .per_dev = true, }, [ETH_SS_STATS] = { .per_dev = true, }, [ETH_SS_PRIV_FLAGS] = { .per_dev = true, }, [ETH_SS_FEATURES] = { .per_dev = false, .count = ARRAY_SIZE(netdev_features_strings), .strings = netdev_features_strings, }, [ETH_SS_RSS_HASH_FUNCS] = { .per_dev = false, .count = ARRAY_SIZE(rss_hash_func_strings), .strings = rss_hash_func_strings, }, [ETH_SS_TUNABLES] = { .per_dev = false, .count = ARRAY_SIZE(tunable_strings), .strings = tunable_strings, }, [ETH_SS_PHY_STATS] = { .per_dev = true, }, [ETH_SS_PHY_TUNABLES] = { .per_dev = false, .count = ARRAY_SIZE(phy_tunable_strings), .strings = phy_tunable_strings, }, [ETH_SS_LINK_MODES] = { .per_dev = false, .count = __ETHTOOL_LINK_MODE_MASK_NBITS, .strings = link_mode_names, }, [ETH_SS_MSG_CLASSES] = { .per_dev = false, .count = NETIF_MSG_CLASS_COUNT, .strings = netif_msg_class_names, }, [ETH_SS_WOL_MODES] = { .per_dev = false, .count = WOL_MODE_COUNT, .strings = wol_mode_names, }, [ETH_SS_SOF_TIMESTAMPING] = { .per_dev = false, .count = __SOF_TIMESTAMPING_CNT, .strings = sof_timestamping_names, }, [ETH_SS_TS_TX_TYPES] = { .per_dev = false, .count = __HWTSTAMP_TX_CNT, .strings = ts_tx_type_names, }, [ETH_SS_TS_RX_FILTERS] = { .per_dev = false, .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, [ETH_SS_TS_FLAGS] = { .per_dev = false, .count = __HWTSTAMP_FLAG_CNT, .strings = ts_flags_names, }, [ETH_SS_UDP_TUNNEL_TYPES] = { .per_dev = false, .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, .strings = udp_tunnel_type_names, }, [ETH_SS_STATS_STD] = { .per_dev = false, .count = __ETHTOOL_STATS_CNT, .strings = stats_std_names, }, [ETH_SS_STATS_ETH_PHY] = { .per_dev = false, .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, .strings = stats_eth_phy_names, }, [ETH_SS_STATS_ETH_MAC] = { .per_dev = false, .count = __ETHTOOL_A_STATS_ETH_MAC_CNT, .strings = stats_eth_mac_names, }, [ETH_SS_STATS_ETH_CTRL] = { .per_dev = false, .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, .strings = stats_eth_ctrl_names, }, [ETH_SS_STATS_RMON] = { .per_dev = false, .count = __ETHTOOL_A_STATS_RMON_CNT, .strings = stats_rmon_names, }, [ETH_SS_STATS_PHY] = { .per_dev = false, .count = __ETHTOOL_A_STATS_PHY_CNT, .strings = stats_phy_names, }, }; struct strset_req_info { struct ethnl_req_info base; u32 req_ids; bool counts_only; }; #define STRSET_REQINFO(__req_base) \ container_of(__req_base, struct strset_req_info, base) struct strset_reply_data { struct ethnl_reply_data base; struct strset_info sets[ETH_SS_COUNT]; }; #define STRSET_REPDATA(__reply_base) \ container_of(__reply_base, struct strset_reply_data, base) const struct nla_policy ethnl_strset_get_policy[] = { [ETHTOOL_A_STRSET_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED }, [ETHTOOL_A_STRSET_COUNTS_ONLY] = { .type = NLA_FLAG }, }; static const struct nla_policy get_stringset_policy[] = { [ETHTOOL_A_STRINGSET_ID] = { .type = NLA_U32 }, }; /** * strset_include() - test if a string set should be included in reply * @info: parsed client request * @data: pointer to request data structure * @id: id of string set to check (ETH_SS_* constants) */ static bool strset_include(const struct strset_req_info *info, const struct strset_reply_data *data, u32 id) { bool per_dev; BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids)); if (info->req_ids) return info->req_ids & (1U << id); per_dev = data->sets[id].per_dev; if (!per_dev && !data->sets[id].strings) return false; return data->base.dev ? per_dev : !per_dev; } static int strset_get_id(const struct nlattr *nest, u32 *val, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(get_stringset_policy)]; int ret; ret = nla_parse_nested(tb, ARRAY_SIZE(get_stringset_policy) - 1, nest, get_stringset_policy, extack); if (ret < 0) return ret; if (NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_STRINGSET_ID)) return -EINVAL; *val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]); return 0; } static const struct nla_policy strset_stringsets_policy[] = { [ETHTOOL_A_STRINGSETS_STRINGSET] = { .type = NLA_NESTED }, }; static int strset_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS]; struct nlattr *attr; int rem, ret; if (!nest) return 0; ret = nla_validate_nested(nest, ARRAY_SIZE(strset_stringsets_policy) - 1, strset_stringsets_policy, extack); if (ret < 0) return ret; req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY]; nla_for_each_nested(attr, nest, rem) { u32 id; if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET, "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n", nla_type(attr))) return -EINVAL; ret = strset_get_id(attr, &id, extack); if (ret < 0) return ret; if (id >= ETH_SS_COUNT) { NL_SET_ERR_MSG_ATTR(extack, attr, "unknown string set id"); return -EOPNOTSUPP; } req_info->req_ids |= (1U << id); } return 0; } static void strset_cleanup_data(struct ethnl_reply_data *reply_base) { struct strset_reply_data *data = STRSET_REPDATA(reply_base); unsigned int i; for (i = 0; i < ETH_SS_COUNT; i++) if (data->sets[i].free_strings) { kfree(data->sets[i].strings); data->sets[i].strings = NULL; data->sets[i].free_strings = false; } } static int strset_prepare_set(struct strset_info *info, struct net_device *dev, struct phy_device *phydev, unsigned int id, bool counts_only) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; void *strings; int count, ret; if (id == ETH_SS_PHY_STATS && phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_sset_count) ret = phy_ops->get_sset_count(phydev); else if (ops->get_sset_count && ops->get_strings) ret = ops->get_sset_count(dev, id); else ret = -EOPNOTSUPP; if (ret <= 0) { info->count = 0; return 0; } count = ret; if (!counts_only) { strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL); if (!strings) return -ENOMEM; if (id == ETH_SS_PHY_STATS && phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_strings) phy_ops->get_strings(phydev, strings); else ops->get_strings(dev, id, strings); info->strings = strings; info->free_strings = true; } info->count = count; return 0; } static int strset_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct strset_reply_data *data = STRSET_REPDATA(reply_base); struct net_device *dev = reply_base->dev; struct nlattr **tb = info->attrs; struct phy_device *phydev; unsigned int i; int ret; BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT); memcpy(&data->sets, &info_template, sizeof(data->sets)); if (!dev) { for (i = 0; i < ETH_SS_COUNT; i++) { if ((req_info->req_ids & (1U << i)) && data->sets[i].per_dev) { GENL_SET_ERR_MSG(info, "requested per device strings without dev"); return -EINVAL; } } return 0; } phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS, info->extack); /* phydev can be NULL, check for errors only */ if (IS_ERR(phydev)) return PTR_ERR(phydev); ret = ethnl_ops_begin(dev); if (ret < 0) goto err_strset; for (i = 0; i < ETH_SS_COUNT; i++) { if (!strset_include(req_info, data, i) || !data->sets[i].per_dev) continue; ret = strset_prepare_set(&data->sets[i], dev, phydev, i, req_info->counts_only); if (ret < 0) goto err_ops; } ethnl_ops_complete(dev); return 0; err_ops: ethnl_ops_complete(dev); err_strset: strset_cleanup_data(reply_base); return ret; } /* calculate size of ETHTOOL_A_STRSET_STRINGSET nest for one string set */ static int strset_set_size(const struct strset_info *info, bool counts_only) { unsigned int len = 0; unsigned int i; if (info->count == 0) return 0; if (counts_only) return nla_total_size(2 * nla_total_size(sizeof(u32))); for (i = 0; i < info->count; i++) { const char *str = info->strings[i]; /* ETHTOOL_A_STRING_INDEX, ETHTOOL_A_STRING_VALUE, nest */ len += nla_total_size(nla_total_size(sizeof(u32)) + ethnl_strz_size(str)); } /* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT */ len = 2 * nla_total_size(sizeof(u32)) + nla_total_size(len); return nla_total_size(len); } static int strset_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); const struct strset_reply_data *data = STRSET_REPDATA(reply_base); unsigned int i; int len = 0; int ret; len += nla_total_size(0); /* ETHTOOL_A_STRSET_STRINGSETS */ for (i = 0; i < ETH_SS_COUNT; i++) { const struct strset_info *set_info = &data->sets[i]; if (!strset_include(req_info, data, i)) continue; ret = strset_set_size(set_info, req_info->counts_only); if (ret < 0) return ret; len += ret; } return len; } /* fill one string into reply */ static int strset_fill_string(struct sk_buff *skb, const struct strset_info *set_info, u32 idx) { struct nlattr *string_attr; const char *value; value = set_info->strings[idx]; string_attr = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING); if (!string_attr) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) || ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, value)) goto nla_put_failure; nla_nest_end(skb, string_attr); return 0; nla_put_failure: nla_nest_cancel(skb, string_attr); return -EMSGSIZE; } /* fill one string set into reply */ static int strset_fill_set(struct sk_buff *skb, const struct strset_info *set_info, u32 id, bool counts_only) { struct nlattr *stringset_attr; struct nlattr *strings_attr; unsigned int i; if (!set_info->per_dev && !set_info->strings) return -EOPNOTSUPP; if (set_info->count == 0) return 0; stringset_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET); if (!stringset_attr) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) || nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count)) goto nla_put_failure; if (!counts_only) { strings_attr = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS); if (!strings_attr) goto nla_put_failure; for (i = 0; i < set_info->count; i++) { if (strset_fill_string(skb, set_info, i) < 0) goto nla_put_failure; } nla_nest_end(skb, strings_attr); } nla_nest_end(skb, stringset_attr); return 0; nla_put_failure: nla_nest_cancel(skb, stringset_attr); return -EMSGSIZE; } static int strset_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct strset_req_info *req_info = STRSET_REQINFO(req_base); const struct strset_reply_data *data = STRSET_REPDATA(reply_base); struct nlattr *nest; unsigned int i; int ret; nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS); if (!nest) return -EMSGSIZE; for (i = 0; i < ETH_SS_COUNT; i++) { if (strset_include(req_info, data, i)) { ret = strset_fill_set(skb, &data->sets[i], i, req_info->counts_only); if (ret < 0) goto nla_put_failure; } } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } const struct ethnl_request_ops ethnl_strset_request_ops = { .request_cmd = ETHTOOL_MSG_STRSET_GET, .reply_cmd = ETHTOOL_MSG_STRSET_GET_REPLY, .hdr_attr = ETHTOOL_A_STRSET_HEADER, .req_info_size = sizeof(struct strset_req_info), .reply_data_size = sizeof(struct strset_reply_data), .allow_nodev_do = true, .parse_request = strset_parse_request, .prepare_data = strset_prepare_data, .reply_size = strset_reply_size, .fill_reply = strset_fill_reply, .cleanup_data = strset_cleanup_data, }; |
| 1 3 3 3 3 1 23 51 1353 2538 2606 2621 48 57 2640 2641 2641 2640 2641 2641 2613 176 662 161 1388 1580 1580 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * vma.h * * Core VMA manipulation API implemented in vma.c. */ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ /* 2 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; /* * Describes a VMA merge operation and is threaded throughout it. * * Any of the fields may be mutated by the merge operation, so no guarantees are * made to the contents of this structure after a merge operation has completed. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; /* * Adjacent VMAs, any of which may be NULL if not present: * * |------|--------|------| * | prev | middle | next | * |------|--------|------| * * middle may not yet exist in the case of a proposed new VMA being * merged, or it may be an existing VMA. * * next may be assigned by the caller. */ struct vm_area_struct *prev; struct vm_area_struct *middle; struct vm_area_struct *next; /* This is the VMA we ultimately target to become the merged VMA. */ struct vm_area_struct *target; /* * Initially, the start, end, pgoff fields are provided by the caller * and describe the proposed new VMA range, whether modifying an * existing VMA (which will be 'middle'), or adding a new one. * * During the merge process these fields are updated to describe the new * range _including those VMAs which will be merged_. */ unsigned long start; unsigned long end; pgoff_t pgoff; unsigned long flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_state state; /* Flags which callers can use to modify merge behaviour: */ /* * If we can expand, simply do so. We know there is nothing to merge to * the right. Does not reset state upon failure to merge. The VMA * iterator is assumed to be positioned at the previous VMA, rather than * at the gap. */ bool just_expand :1; /* * If a merge is possible, but an OOM error occurs, give up and don't * execute the merge, returning NULL. */ bool give_up_on_oom :1; /* * If set, skip uprobe_mmap upon merged vma. */ bool skip_vma_uprobe :1; /* Internal flags set during merge process: */ /* * Internal flag indicating the merge increases vmg->middle->vm_start * (and thereby, vmg->prev->vm_end). */ bool __adjust_middle_start :1; /* * Internal flag indicating the merge decreases vmg->next->vm_start * (and thereby, vmg->middle->vm_end). */ bool __adjust_next_start :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->middle. */ bool __remove_middle :1; /* * Internal flag used during the merge operationr to indicate we will * remove vmg->next. */ bool __remove_next :1; }; static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .flags = flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .middle = vma_, \ .next = NULL, \ .start = start_, \ .end = end_, \ .flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif __must_check int vma_expand(struct vma_merge_struct *vmg); __must_check int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ __must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); __must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); void unlink_file_vma(struct vm_area_struct *vma); void vma_link_file(struct vm_area_struct *vma); int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); bool vma_needs_dirty_tracking(struct vm_area_struct *vma); bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } #ifdef CONFIG_MMU static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } #endif static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * These three helpers classifies VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. */ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } static inline void vma_iter_reset(struct vma_iterator *vmi) { mas_reset(&vmi->mas); } static inline struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev_range(&vmi->mas, min); } static inline struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) { return mas_next_range(&vmi->mas, max); } static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area(&vmi->mas, min, max - 1, size); } static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area_rev(&vmi->mas, min, max - 1, size); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store_overwrite(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline void vma_iter_store_new(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_mark_attached(vma); vma_iter_store_overwrite(vmi, vma); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } /* * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or * if no previous VMA, to index 0. */ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. * * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } /* * check if a vma is sealed for modification. * return true, if modification is allowed. */ static inline bool can_modify_vma(struct vm_area_struct *vma) { if (unlikely(vma_is_sealed(vma))) return false; return true; } bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); #else static inline bool can_modify_vma(struct vm_area_struct *vma) { return true; } static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { return true; } #endif #if defined(CONFIG_STACK_GROWSUP) int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); /* vma_init.h, shared between CONFIG_MMU and nommu. */ void __init vma_state_init(void); struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); void vm_area_free(struct vm_area_struct *vma); /* vma_exec.c */ #ifdef CONFIG_MMU int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif #endif /* __MM_VMA_H */ |
| 102 93 137 137 84 102 141 94 83 1 87 94 94 94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQ_H #define _LINUX_IRQ_H /* * Please do not include this file in generic code. There is currently * no requirement for any architecture to implement anything held * within this file. * * Thanks. --rmk */ #include <linux/cache.h> #include <linux/spinlock.h> #include <linux/cpumask.h> #include <linux/irqhandler.h> #include <linux/irqreturn.h> #include <linux/irqnr.h> #include <linux/topology.h> #include <linux/io.h> #include <linux/slab.h> #include <asm/irq.h> #include <asm/ptrace.h> #include <asm/irq_regs.h> struct seq_file; struct module; struct msi_msg; struct irq_affinity_desc; enum irqchip_irq_state; /* * IRQ line status. * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered * IRQ_TYPE_EDGE_BOTH - rising and falling edge triggered * IRQ_TYPE_LEVEL_HIGH - high level triggered * IRQ_TYPE_LEVEL_LOW - low level triggered * IRQ_TYPE_LEVEL_MASK - Mask to filter out the level bits * IRQ_TYPE_SENSE_MASK - Mask for all the above bits * IRQ_TYPE_DEFAULT - For use by some PICs to ask irq_set_type * to setup the HW to a sane default (used * by irqdomain map() callbacks to synchronize * the HW state and SW flags for a newly * allocated descriptor). * * IRQ_TYPE_PROBE - Special flag for probing in progress * * Bits which can be modified via irq_set/clear/modify_status_flags() * IRQ_LEVEL - Interrupt is level type. Will be also * updated in the code when the above trigger * bits are modified via irq_set_irq_type() * IRQ_PER_CPU - Mark an interrupt PER_CPU. Will protect * it from affinity setting * IRQ_NOPROBE - Interrupt cannot be probed by autoprobing * IRQ_NOREQUEST - Interrupt cannot be requested via * request_irq() * IRQ_NOTHREAD - Interrupt cannot be threaded * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. Exclude * it from the spurious interrupt detection * mechanism and from core side polling. * IRQ_DISABLE_UNLAZY - Disable lazy irq disable * IRQ_HIDDEN - Don't show up in /proc/interrupts * IRQ_NO_DEBUG - Exclude from note_interrupt() debugging */ enum { IRQ_TYPE_NONE = 0x00000000, IRQ_TYPE_EDGE_RISING = 0x00000001, IRQ_TYPE_EDGE_FALLING = 0x00000002, IRQ_TYPE_EDGE_BOTH = (IRQ_TYPE_EDGE_FALLING | IRQ_TYPE_EDGE_RISING), IRQ_TYPE_LEVEL_HIGH = 0x00000004, IRQ_TYPE_LEVEL_LOW = 0x00000008, IRQ_TYPE_LEVEL_MASK = (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH), IRQ_TYPE_SENSE_MASK = 0x0000000f, IRQ_TYPE_DEFAULT = IRQ_TYPE_SENSE_MASK, IRQ_TYPE_PROBE = 0x00000010, IRQ_LEVEL = (1 << 8), IRQ_PER_CPU = (1 << 9), IRQ_NOPROBE = (1 << 10), IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), IRQ_IS_POLLED = (1 << 18), IRQ_DISABLE_UNLAZY = (1 << 19), IRQ_HIDDEN = (1 << 20), IRQ_NO_DEBUG = (1 << 21), }; #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ IRQ_NOAUTOEN | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN) #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) /* * Return value for chip->irq_set_affinity() * * IRQ_SET_MASK_OK - OK, core updates irq_common_data.affinity * IRQ_SET_MASK_NOCOPY - OK, chip did update irq_common_data.affinity * IRQ_SET_MASK_OK_DONE - Same as IRQ_SET_MASK_OK for core. Special code to * support stacked irqchips, which indicates skipping * all descendant irqchips. */ enum { IRQ_SET_MASK_OK = 0, IRQ_SET_MASK_OK_NOCOPY, IRQ_SET_MASK_OK_DONE, }; struct msi_desc; struct irq_domain; /** * struct irq_common_data - per irq data shared by all irqchips * @state_use_accessors: status information for irq chip functions. * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP. If this is an IPI * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @effective_affinity: The effective IRQ affinity on SMP as some irq * chips do not allow multi CPU destinations. * A subset of @affinity. * @msi_desc: MSI descriptor * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif void *handler_data; struct msi_desc *msi_desc; #ifdef CONFIG_SMP cpumask_var_t affinity; #endif #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK cpumask_var_t effective_affinity; #endif #ifdef CONFIG_GENERIC_IRQ_IPI unsigned int ipi_offset; #endif }; /** * struct irq_data - per irq chip data passed down to chip functions * @mask: precomputed bitmask for accessing the chip registers * @irq: interrupt number * @hwirq: hardware interrupt number, local to the interrupt domain * @common: point to data shared by all irqchips * @chip: low level interrupt hardware access * @domain: Interrupt translation domain; responsible for mapping * between hwirq number and linux irq number. * @parent_data: pointer to parent struct irq_data to support hierarchy * irq_domain * @chip_data: platform-specific per-chip private data for the chip * methods, to allow shared chip implementations */ struct irq_data { u32 mask; unsigned int irq; irq_hw_number_t hwirq; struct irq_common_data *common; struct irq_chip *chip; struct irq_domain *domain; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data *parent_data; #endif void *chip_data; }; /* * Bit masks for irq_common_data.state_use_accessors * * IRQD_TRIGGER_MASK - Mask for the trigger type bits * IRQD_SETAFFINITY_PENDING - Affinity setting is pending * IRQD_ACTIVATED - Interrupt has already been activated * IRQD_NO_BALANCING - Balancing disabled for this IRQ * IRQD_PER_CPU - Interrupt is per cpu * IRQD_AFFINITY_SET - Interrupt affinity was set * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt * IRQD_WAKEUP_ARMED - Wakeup mode armed * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel * IRQD_IRQ_STARTED - Startup state of the interrupt * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target * IRQD_DEFAULT_TRIGGER_SET - Expected trigger already been set * IRQD_CAN_RESERVE - Can use reservation mode * IRQD_HANDLE_ENFORCE_IRQCTX - Enforce that handle_irq_*() is only invoked * from actual interrupt context. * IRQD_AFFINITY_ON_ACTIVATE - Affinity is set on activation. Don't call * irq_chip::irq_set_affinity() when deactivated. * IRQD_IRQ_ENABLED_ON_SUSPEND - Interrupt is enabled on suspend by irq pm if * irqchip have flag IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND set. * IRQD_RESEND_WHEN_IN_PROGRESS - Interrupt may fire when already in progress in which * case it must be resent at the next available opportunity. */ enum { IRQD_TRIGGER_MASK = 0xf, IRQD_SETAFFINITY_PENDING = BIT(8), IRQD_ACTIVATED = BIT(9), IRQD_NO_BALANCING = BIT(10), IRQD_PER_CPU = BIT(11), IRQD_AFFINITY_SET = BIT(12), IRQD_LEVEL = BIT(13), IRQD_WAKEUP_STATE = BIT(14), IRQD_IRQ_DISABLED = BIT(16), IRQD_IRQ_MASKED = BIT(17), IRQD_IRQ_INPROGRESS = BIT(18), IRQD_WAKEUP_ARMED = BIT(19), IRQD_FORWARDED_TO_VCPU = BIT(20), IRQD_AFFINITY_MANAGED = BIT(21), IRQD_IRQ_STARTED = BIT(22), IRQD_MANAGED_SHUTDOWN = BIT(23), IRQD_SINGLE_TARGET = BIT(24), IRQD_DEFAULT_TRIGGER_SET = BIT(25), IRQD_CAN_RESERVE = BIT(26), IRQD_HANDLE_ENFORCE_IRQCTX = BIT(27), IRQD_AFFINITY_ON_ACTIVATE = BIT(28), IRQD_IRQ_ENABLED_ON_SUSPEND = BIT(29), IRQD_RESEND_WHEN_IN_PROGRESS = BIT(30), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SETAFFINITY_PENDING; } static inline bool irqd_is_per_cpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_PER_CPU; } static inline bool irqd_can_balance(struct irq_data *d) { return !(__irqd_to_state(d) & (IRQD_PER_CPU | IRQD_NO_BALANCING)); } static inline bool irqd_affinity_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_SET; } static inline void irqd_mark_affinity_was_set(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_SET; } static inline bool irqd_trigger_type_was_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET; } static inline u32 irqd_get_trigger_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_TRIGGER_MASK; } /* * Must only be called inside irq_chip.irq_set_type() functions or * from the DT/ACPI setup code. */ static inline void irqd_set_trigger_type(struct irq_data *d, u32 type) { __irqd_to_state(d) &= ~IRQD_TRIGGER_MASK; __irqd_to_state(d) |= type & IRQD_TRIGGER_MASK; __irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET; } static inline bool irqd_is_level_type(struct irq_data *d) { return __irqd_to_state(d) & IRQD_LEVEL; } /* * Must only be called of irqchip.irq_set_affinity() or low level * hierarchy domain allocation functions. */ static inline void irqd_set_single_target(struct irq_data *d) { __irqd_to_state(d) |= IRQD_SINGLE_TARGET; } static inline bool irqd_is_single_target(struct irq_data *d) { return __irqd_to_state(d) & IRQD_SINGLE_TARGET; } static inline void irqd_set_handle_enforce_irqctx(struct irq_data *d) { __irqd_to_state(d) |= IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_handle_enforce_irqctx(struct irq_data *d) { return __irqd_to_state(d) & IRQD_HANDLE_ENFORCE_IRQCTX; } static inline bool irqd_is_enabled_on_suspend(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_ENABLED_ON_SUSPEND; } static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; } static inline bool irqd_irq_masked(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_MASKED; } static inline bool irqd_irq_inprogress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_INPROGRESS; } static inline bool irqd_is_wakeup_armed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_ARMED; } static inline bool irqd_is_forwarded_to_vcpu(struct irq_data *d) { return __irqd_to_state(d) & IRQD_FORWARDED_TO_VCPU; } static inline void irqd_set_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) |= IRQD_FORWARDED_TO_VCPU; } static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } static inline bool irqd_affinity_is_managed(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_MANAGED; } static inline bool irqd_is_activated(struct irq_data *d) { return __irqd_to_state(d) & IRQD_ACTIVATED; } static inline void irqd_set_activated(struct irq_data *d) { __irqd_to_state(d) |= IRQD_ACTIVATED; } static inline void irqd_clr_activated(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_ACTIVATED; } static inline bool irqd_is_started(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_STARTED; } static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } static inline void irqd_set_can_reserve(struct irq_data *d) { __irqd_to_state(d) |= IRQD_CAN_RESERVE; } static inline void irqd_clr_can_reserve(struct irq_data *d) { __irqd_to_state(d) &= ~IRQD_CAN_RESERVE; } static inline bool irqd_can_reserve(struct irq_data *d) { return __irqd_to_state(d) & IRQD_CAN_RESERVE; } static inline void irqd_set_affinity_on_activate(struct irq_data *d) { __irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE; } static inline bool irqd_affinity_on_activate(struct irq_data *d) { return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; } static inline void irqd_set_resend_when_in_progress(struct irq_data *d) { __irqd_to_state(d) |= IRQD_RESEND_WHEN_IN_PROGRESS; } static inline bool irqd_needs_resend_when_in_progress(struct irq_data *d) { return __irqd_to_state(d) & IRQD_RESEND_WHEN_IN_PROGRESS; } #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; } /** * struct irq_chip - hardware interrupt chip descriptor * * @name: name for /proc/interrupts * @irq_startup: start up the interrupt (defaults to ->enable if NULL) * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL) * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL) * @irq_disable: disable the interrupt * @irq_ack: start of a new interrupt * @irq_mask: mask an interrupt source * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force * argument is true, it tells the driver to * unconditionally apply the affinity setting. Sanity * checks against the supplied affinity mask are not * required. This is used for CPU hotplug where the * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips * @irq_cpu_online: configure an interrupt source for a secondary CPU * @irq_cpu_offline: un-configure an interrupt source for a secondary CPU * @irq_suspend: function called from core code on suspend once per * chip, when one or more interrupts are installed * @irq_resume: function called from core code on resume once per chip, * when one ore more interrupts are installed * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts * @irq_request_resources: optional to request resources before calling * any other callback related to this irq * @irq_release_resources: optional to release resources acquired with * irq_request_resources * @irq_compose_msi_msg: optional to compose message content for MSI * @irq_write_msi_msg: optional to write message content for MSI * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine * @ipi_send_single: send a single IPI to destination cpus * @ipi_send_mask: send an IPI to destination cpus in cpumask * @irq_nmi_setup: function called from core code before enabling an NMI * @irq_nmi_teardown: function called from core code after disabling an NMI * @irq_force_complete_move: optional function to force complete pending irq move * @flags: chip specific flags */ struct irq_chip { const char *name; unsigned int (*irq_startup)(struct irq_data *data); void (*irq_shutdown)(struct irq_data *data); void (*irq_enable)(struct irq_data *data); void (*irq_disable)(struct irq_data *data); void (*irq_ack)(struct irq_data *data); void (*irq_mask)(struct irq_data *data); void (*irq_mask_ack)(struct irq_data *data); void (*irq_unmask)(struct irq_data *data); void (*irq_eoi)(struct irq_data *data); int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force); int (*irq_retrigger)(struct irq_data *data); int (*irq_set_type)(struct irq_data *data, unsigned int flow_type); int (*irq_set_wake)(struct irq_data *data, unsigned int on); void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE void (*irq_cpu_online)(struct irq_data *data); void (*irq_cpu_offline)(struct irq_data *data); #endif void (*irq_suspend)(struct irq_data *data); void (*irq_resume)(struct irq_data *data); void (*irq_pm_shutdown)(struct irq_data *data); void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); int (*irq_request_resources)(struct irq_data *data); void (*irq_release_resources)(struct irq_data *data); void (*irq_compose_msi_msg)(struct irq_data *data, struct msi_msg *msg); void (*irq_write_msi_msg)(struct irq_data *data, struct msi_msg *msg); int (*irq_get_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool *state); int (*irq_set_irqchip_state)(struct irq_data *data, enum irqchip_irq_state which, bool state); int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); int (*irq_nmi_setup)(struct irq_data *data); void (*irq_nmi_teardown)(struct irq_data *data); void (*irq_force_complete_move)(struct irq_data *data); unsigned long flags; }; /* * irq_chip specific flags * * IRQCHIP_SET_TYPE_MASKED: Mask before calling chip.irq_set_type() * IRQCHIP_EOI_IF_HANDLED: Only issue irq_eoi() when irq was handled * IRQCHIP_MASK_ON_SUSPEND: Mask non wake irqs in the suspend path * IRQCHIP_ONOFFLINE_ENABLED: Only call irq_on/off_line callbacks * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI: Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip * IRQCHIP_MOVE_DEFERRED: Move the interrupt in actual interrupt context */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), IRQCHIP_EOI_IF_HANDLED = (1 << 1), IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), IRQCHIP_IMMUTABLE = (1 << 11), IRQCHIP_MOVE_DEFERRED = (1 << 12), }; #include <linux/irqdesc.h> /* * Pick up the arch-dependent methods: */ #include <asm/hw_irq.h> #ifndef NR_IRQS_LEGACY # define NR_IRQS_LEGACY 0 #endif #ifndef ARCH_IRQ_INIT_FLAGS # define ARCH_IRQ_INIT_FLAGS 0 #endif #define IRQ_DEFAULT_INIT_FLAGS ARCH_IRQ_INIT_FLAGS struct irqaction; extern int setup_percpu_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE extern void irq_cpu_online(void); extern void irq_cpu_offline(void); #endif extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION) extern void irq_migrate_all_off_this_cpu(void); extern int irq_affinity_online_cpu(unsigned int cpu); #else # define irq_affinity_online_cpu NULL #endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) bool irq_can_move_in_process_context(struct irq_data *data); void __irq_move_irq(struct irq_data *data); static inline void irq_move_irq(struct irq_data *data) { if (unlikely(irqd_is_setaffinity_pending(data))) __irq_move_irq(data); } void irq_move_masked_irq(struct irq_data *data); #else static inline bool irq_can_move_in_process_context(struct irq_data *data) { return true; } static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } #endif extern int no_irq_affinity; #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq); #else static inline int irq_set_parent(int irq, int parent_irq) { return 0; } #endif /* * Built-in IRQ handlers for various IRQ types, * callable via desc->handle_irq() */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); extern void handle_untracked_irq(struct irq_desc *desc); extern void handle_percpu_irq(struct irq_desc *desc); extern void handle_percpu_devid_irq(struct irq_desc *desc); extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY extern void handle_fasteoi_ack_irq(struct irq_desc *desc); extern void handle_fasteoi_mask_irq(struct irq_desc *desc); extern int irq_chip_set_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool val); extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); extern int irq_chip_retrigger_hierarchy(struct irq_data *data); extern void irq_chip_mask_parent(struct irq_data *data); extern void irq_chip_mask_ack_parent(struct irq_data *data); extern void irq_chip_unmask_parent(struct irq_data *data); extern void irq_chip_eoi_parent(struct irq_data *data); extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); extern int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info); extern int irq_chip_set_type_parent(struct irq_data *data, unsigned int type); extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif /* Disable or mask interrupts during a kernel kexec */ extern void machine_kexec_mask_interrupts(void); /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); /* Enable/disable irq debugging output: */ extern int noirqdebug_setup(char *str); /* Checks whether the interrupt can be requested by request_irq(): */ extern bool can_request_irq(unsigned int irq, unsigned long irqflags); /* Dummy irq-chip implementations: */ extern struct irq_chip no_irq_chip; extern struct irq_chip dummy_irq_chip; extern void irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name); static inline void irq_set_chip_and_handler(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle) { irq_set_chip_and_handler_name(irq, chip, handle, NULL); } extern int irq_set_percpu_devid(unsigned int irq); extern int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity); extern int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); static inline void irq_set_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 0, NULL); } /* * Set a highlevel chained flow handler for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ static inline void irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle) { __irq_set_handler(irq, handle, 1, NULL); } /* * Set a highlevel chained flow handler and its data for a given IRQ. * (a chained handler is automatically enabled and set to * IRQ_NOREQUEST, IRQ_NOPROBE, and IRQ_NOTHREAD) */ void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set); static inline void irq_set_status_flags(unsigned int irq, unsigned long set) { irq_modify_status(irq, 0, set); } static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr) { irq_modify_status(irq, clr, 0); } static inline void irq_set_noprobe(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOPROBE); } static inline void irq_set_probe(unsigned int irq) { irq_modify_status(irq, IRQ_NOPROBE, 0); } static inline void irq_set_nothread(unsigned int irq) { irq_modify_status(irq, 0, IRQ_NOTHREAD); } static inline void irq_set_thread(unsigned int irq) { irq_modify_status(irq, IRQ_NOTHREAD, 0); } static inline void irq_set_nested_thread(unsigned int irq, bool nest) { if (nest) irq_set_status_flags(irq, IRQ_NESTED_THREAD); else irq_clear_status_flags(irq, IRQ_NESTED_THREAD); } static inline void irq_set_percpu_devid_flags(unsigned int irq) { irq_set_status_flags(irq, IRQ_NOAUTOEN | IRQ_PER_CPU | IRQ_NOTHREAD | IRQ_NOPROBE | IRQ_PER_CPU_DEVID); } /* Set/get chip/data for an IRQ: */ extern int irq_set_chip(unsigned int irq, const struct irq_chip *chip); extern int irq_set_handler_data(unsigned int irq, void *data); extern int irq_set_chip_data(unsigned int irq, void *data); extern int irq_set_irq_type(unsigned int irq, unsigned int type); extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry); extern struct irq_data *irq_get_irq_data(unsigned int irq); static inline struct irq_chip *irq_get_chip(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip : NULL; } static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d) { return d->chip; } static inline void *irq_get_chip_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->chip_data : NULL; } static inline void *irq_data_get_irq_chip_data(struct irq_data *d) { return d->chip_data; } static inline void *irq_get_handler_data(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->handler_data : NULL; } static inline void *irq_data_get_irq_handler_data(struct irq_data *d) { return d->common->handler_data; } static inline struct msi_desc *irq_get_msi_desc(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? d->common->msi_desc : NULL; } static inline struct msi_desc *irq_data_get_msi_desc(struct irq_data *d) { return d->common->msi_desc; } static inline u32 irq_get_trigger_type(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irqd_get_trigger_type(d) : 0; } static inline int irq_common_data_get_node(struct irq_common_data *d) { #ifdef CONFIG_NUMA return d->node; #else return 0; #endif } static inline int irq_data_get_node(struct irq_data *d) { return irq_common_data_get_node(d->common); } static inline const struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) { #ifdef CONFIG_SMP return d->common->affinity; #else return cpumask_of(0); #endif } static inline void irq_data_update_affinity(struct irq_data *d, const struct cpumask *m) { #ifdef CONFIG_SMP cpumask_copy(d->common->affinity, m); #endif } static inline const struct cpumask *irq_get_affinity_mask(int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_affinity_mask(d) : NULL; } #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { cpumask_copy(d->common->effective_affinity, m); } #else static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) { } static inline const struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { return irq_data_get_affinity_mask(d); } #endif static inline const struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) { struct irq_data *d = irq_get_irq_data(irq); return d ? irq_data_get_effective_affinity_mask(d) : NULL; } unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity); /* use macros to avoid needing export.h for THIS_MODULE */ #define irq_alloc_descs(irq, from, cnt, node) \ __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE, NULL) #define irq_alloc_desc(node) \ irq_alloc_descs(-1, 1, 1, node) #define irq_alloc_desc_at(at, node) \ irq_alloc_descs(at, at, 1, node) #define irq_alloc_desc_from(from, node) \ irq_alloc_descs(-1, from, 1, node) #define irq_alloc_descs_from(from, cnt, node) \ irq_alloc_descs(-1, from, cnt, node) #define devm_irq_alloc_descs(dev, irq, from, cnt, node) \ __devm_irq_alloc_descs(dev, irq, from, cnt, node, THIS_MODULE, NULL) #define devm_irq_alloc_desc(dev, node) \ devm_irq_alloc_descs(dev, -1, 1, 1, node) #define devm_irq_alloc_desc_at(dev, at, node) \ devm_irq_alloc_descs(dev, at, at, 1, node) #define devm_irq_alloc_desc_from(dev, from, node) \ devm_irq_alloc_descs(dev, -1, from, 1, node) #define devm_irq_alloc_descs_from(dev, from, cnt, node) \ devm_irq_alloc_descs(dev, -1, from, cnt, node) void irq_free_descs(unsigned int irq, unsigned int cnt); static inline void irq_free_desc(unsigned int irq) { irq_free_descs(irq, 1); } #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base * @disable: Disable register offset to reg_base * @mask: Mask register offset to reg_base * @ack: Ack register offset to reg_base * @eoi: Eoi register offset to reg_base * @type: Type configuration register offset to reg_base */ struct irq_chip_regs { unsigned long enable; unsigned long disable; unsigned long mask; unsigned long ack; unsigned long eoi; unsigned long type; }; /** * struct irq_chip_type - Generic interrupt chip instance for a flow type * @chip: The real interrupt chip which provides the callbacks * @regs: Register offsets for this chip * @handler: Flow handler associated with this chip * @type: Chip can handle these flow types * @mask_cache_priv: Cached mask register private to the chip type * @mask_cache: Pointer to cached mask register * * A irq_generic_chip can have several instances of irq_chip_type when * it requires different functions and register offsets for different * flow types. */ struct irq_chip_type { struct irq_chip chip; struct irq_chip_regs regs; irq_flow_handler_t handler; u32 type; u32 mask_cache_priv; u32 *mask_cache; }; /** * struct irq_chip_generic - Generic irq chip data structure * @lock: Lock to protect register and cache data access * @reg_base: Register base address (virtual) * @reg_readl: Alternate I/O accessor (defaults to readl if NULL) * @reg_writel: Alternate I/O accessor (defaults to writel if NULL) * @suspend: Function called from core code on suspend once per * chip; can be useful instead of irq_chip::suspend to * handle chip details even when no interrupts are in use * @resume: Function called from core code on resume once per chip; * can be useful instead of irq_chip::suspend to handle * chip details even when no interrupts are in use * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip * @mask_cache: Cached mask register shared between all chip types * @wake_enabled: Interrupt can wakeup from suspend * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) * @private: Private data for non generic chip callbacks * @installed: bitfield to denote installed interrupts * @unused: bitfield to denote unused interrupts * @domain: irq domain pointer * @list: List head for keeping track of instances * @chip_types: Array of interrupt irq_chip_types * * Note, that irq_chip_generic can have multiple irq_chip_type * implementations which can be associated to a particular irq line of * an irq_chip_generic instance. That allows to share and protect * state in an irq_chip_generic instance when we need to implement * different flow mechanisms (level/edge) for it. */ struct irq_chip_generic { raw_spinlock_t lock; void __iomem *reg_base; u32 (*reg_readl)(void __iomem *addr); void (*reg_writel)(u32 val, void __iomem *addr); void (*suspend)(struct irq_chip_generic *gc); void (*resume)(struct irq_chip_generic *gc); unsigned int irq_base; unsigned int irq_cnt; u32 mask_cache; u32 wake_enabled; u32 wake_active; unsigned int num_ct; void *private; unsigned long installed; unsigned long unused; struct irq_domain *domain; struct list_head list; struct irq_chip_type chip_types[]; }; /** * enum irq_gc_flags - Initialization flags for generic irq chips * @IRQ_GC_INIT_MASK_CACHE: Initialize the mask_cache by reading mask reg * @IRQ_GC_INIT_NESTED_LOCK: Set the lock class of the irqs to nested for * irq chips which need to call irq_set_wake() on * the parent irq. Usually GPIO implementations * @IRQ_GC_MASK_CACHE_PER_TYPE: Mask cache is chip type private * @IRQ_GC_NO_MASK: Do not calculate irq_data->mask * @IRQ_GC_BE_IO: Use big-endian register accesses (default: LE) */ enum irq_gc_flags { IRQ_GC_INIT_MASK_CACHE = 1 << 0, IRQ_GC_INIT_NESTED_LOCK = 1 << 1, IRQ_GC_MASK_CACHE_PER_TYPE = 1 << 2, IRQ_GC_NO_MASK = 1 << 3, IRQ_GC_BE_IO = 1 << 4, }; /* * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains * @irqs_per_chip: Number of interrupts per chip * @num_chips: Number of chips * @irq_flags_to_set: IRQ* flags to set on irq setup * @irq_flags_to_clear: IRQ* flags to clear on irq setup * @gc_flags: Generic chip specific setup flags * @exit: Function called on each chip when they are destroyed. * @gc: Array of pointers to generic interrupt chips */ struct irq_domain_chip_generic { unsigned int irqs_per_chip; unsigned int num_chips; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; void (*exit)(struct irq_chip_generic *gc); struct irq_chip_generic *gc[]; }; /** * struct irq_domain_chip_generic_info - Generic chip information structure * @name: Name of the generic interrupt chip * @handler: Interrupt handler used by the generic interrupt chip * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with each * chip * @irq_flags_to_clear: IRQ_* bits to clear in the mapping function * @irq_flags_to_set: IRQ_* bits to set in the mapping function * @gc_flags: Generic chip specific setup flags * @init: Function called on each chip when they are created. * Allow to do some additional chip initialisation. * @exit: Function called on each chip when they are destroyed. * Allow to do some chip cleanup operation. */ struct irq_domain_chip_generic_info { const char *name; irq_flow_handler_t handler; unsigned int irqs_per_chip; unsigned int num_ct; unsigned int irq_flags_to_clear; unsigned int irq_flags_to_set; enum irq_gc_flags gc_flags; int (*init)(struct irq_chip_generic *gc); void (*exit)(struct irq_chip_generic *gc); }; /* Generic chip callback functions */ void irq_gc_noop(struct irq_data *d); void irq_gc_mask_disable_reg(struct irq_data *d); void irq_gc_mask_set_bit(struct irq_data *d); void irq_gc_mask_clr_bit(struct irq_data *d); void irq_gc_unmask_enable_reg(struct irq_data *d); void irq_gc_ack_set_bit(struct irq_data *d); void irq_gc_ack_clr_bit(struct irq_data *d); void irq_gc_mask_disable_and_ack_set(struct irq_data *d); void irq_gc_eoi(struct irq_data *d); int irq_gc_set_wake(struct irq_data *d, unsigned int on); /* Setup functions for irq_chip_generic */ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw_irq); void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq); struct irq_chip_generic * irq_alloc_generic_chip(const char *name, int nr_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); struct irq_chip_generic * devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, u32 msk, enum irq_gc_flags flags, unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); #ifdef CONFIG_GENERIC_IRQ_CHIP int irq_domain_alloc_generic_chips(struct irq_domain *d, const struct irq_domain_chip_generic_info *info); void irq_domain_remove_generic_chips(struct irq_domain *d); #else static inline int irq_domain_alloc_generic_chips(struct irq_domain *d, const struct irq_domain_chip_generic_info *info) { return -EINVAL; } static inline void irq_domain_remove_generic_chips(struct irq_domain *d) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, int num_ct, const char *name, irq_flow_handler_t handler, unsigned int clr, unsigned int set, enum irq_gc_flags flags); #define irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name, \ handler, clr, set, flags) \ ({ \ MAYBE_BUILD_BUG_ON(irqs_per_chip > 32); \ __irq_alloc_domain_generic_chips(d, irqs_per_chip, num_ct, name,\ handler, clr, set, flags); \ }) static inline void irq_free_generic_chip(struct irq_chip_generic *gc) { kfree(gc); } static inline void irq_destroy_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { irq_remove_generic_chip(gc, msk, clr, set); irq_free_generic_chip(gc); } static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) { return container_of(d->chip, struct irq_chip_type, chip); } #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { if (gc->reg_writel) gc->reg_writel(val, gc->reg_base + reg_offset); else writel(val, gc->reg_base + reg_offset); } static inline u32 irq_reg_readl(struct irq_chip_generic *gc, int reg_offset) { if (gc->reg_readl) return gc->reg_readl(gc->reg_base + reg_offset); else return readl(gc->reg_base + reg_offset); } struct irq_matrix; struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits, unsigned int alloc_start, unsigned int alloc_end); void irq_matrix_online(struct irq_matrix *m); void irq_matrix_offline(struct irq_matrix *m); void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace); int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk); void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk); int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk, unsigned int *mapped_cpu); void irq_matrix_reserve(struct irq_matrix *m); void irq_matrix_remove_reserved(struct irq_matrix *m); int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu); void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, unsigned int bit, bool managed); void irq_matrix_assign(struct irq_matrix *m, unsigned int bit); unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown); unsigned int irq_matrix_allocated(struct irq_matrix *m); unsigned int irq_matrix_reserved(struct irq_matrix *m); void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind); /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); int ipi_send_single(unsigned int virq, unsigned int cpu); int ipi_send_mask(unsigned int virq, const struct cpumask *dest); void ipi_mux_process(void); int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)); #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER /* * Registers a generic IRQ handling function as the top-level IRQ handler in * the system, which is generally the first C code called from an assembly * architecture-specific interrupt handler. * * Returns 0 on success, or -EBUSY if an IRQ handler has already been * registered. */ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)); /* * Allows interrupt handlers to find the irqchip that's been registered as the * top-level IRQ handler. */ extern void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; asmlinkage void generic_handle_arch_irq(struct pt_regs *regs); #else #ifndef set_handle_irq #define set_handle_irq(handle_irq) \ do { \ (void)handle_irq; \ WARN_ON(1); \ } while (0) #endif #endif #endif /* _LINUX_IRQ_H */ |
| 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 | // SPDX-License-Identifier: GPL-2.0-only /* * Infrastructure for migratable timers * * Copyright(C) 2022 linutronix GmbH */ #include <linux/cpuhotplug.h> #include <linux/slab.h> #include <linux/smp.h> #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> #include "timer_migration.h" #include "tick-internal.h" #define CREATE_TRACE_POINTS #include <trace/events/timer_migration.h> /* * The timer migration mechanism is built on a hierarchy of groups. The * lowest level group contains CPUs, the next level groups of CPU groups * and so forth. The CPU groups are kept per node so for the normal case * lock contention won't happen across nodes. Depending on the number of * CPUs per node even the next level might be kept as groups of CPU groups * per node and only the levels above cross the node topology. * * Example topology for a two node system with 24 CPUs each. * * LVL 2 [GRP2:0] * GRP1:0 = GRP1:M * * LVL 1 [GRP1:0] [GRP1:1] * GRP0:0 - GRP0:2 GRP0:3 - GRP0:5 * * LVL 0 [GRP0:0] [GRP0:1] [GRP0:2] [GRP0:3] [GRP0:4] [GRP0:5] * CPUS 0-7 8-15 16-23 24-31 32-39 40-47 * * The groups hold a timer queue of events sorted by expiry time. These * queues are updated when CPUs go in idle. When they come out of idle * ignore flag of events is set. * * Each group has a designated migrator CPU/group as long as a CPU/group is * active in the group. This designated role is necessary to avoid that all * active CPUs in a group try to migrate expired timers from other CPUs, * which would result in massive lock bouncing. * * When a CPU is awake, it checks in it's own timer tick the group * hierarchy up to the point where it is assigned the migrator role or if * no CPU is active, it also checks the groups where no migrator is set * (TMIGR_NONE). * * If it finds expired timers in one of the group queues it pulls them over * from the idle CPU and runs the timer function. After that it updates the * group and the parent groups if required. * * CPUs which go idle arm their CPU local timer hardware for the next local * (pinned) timer event. If the next migratable timer expires after the * next local timer or the CPU has no migratable timer pending then the * CPU does not queue an event in the LVL0 group. If the next migratable * timer expires before the next local timer then the CPU queues that timer * in the LVL0 group. In both cases the CPU marks itself idle in the LVL0 * group. * * When CPU comes out of idle and when a group has at least a single active * child, the ignore flag of the tmigr_event is set. This indicates, that * the event is ignored even if it is still enqueued in the parent groups * timer queue. It will be removed when touching the timer queue the next * time. This spares locking in active path as the lock protects (after * setup) only event information. For more information about locking, * please read the section "Locking rules". * * If the CPU is the migrator of the group then it delegates that role to * the next active CPU in the group or sets migrator to TMIGR_NONE when * there is no active CPU in the group. This delegation needs to be * propagated up the hierarchy so hand over from other leaves can happen at * all hierarchy levels w/o doing a search. * * When the last CPU in the system goes idle, then it drops all migrator * duties up to the top level of the hierarchy (LVL2 in the example). It * then has to make sure, that it arms it's own local hardware timer for * the earliest event in the system. * * * Lifetime rules: * --------------- * * The groups are built up at init time or when CPUs come online. They are * not destroyed when a group becomes empty due to offlining. The group * just won't participate in the hierarchy management anymore. Destroying * groups would result in interesting race conditions which would just make * the whole mechanism slow and complex. * * * Locking rules: * -------------- * * For setting up new groups and handling events it's required to lock both * child and parent group. The lock ordering is always bottom up. This also * includes the per CPU locks in struct tmigr_cpu. For updating the migrator and * active CPU/group information atomic_try_cmpxchg() is used instead and only * the per CPU tmigr_cpu->lock is held. * * During the setup of groups tmigr_level_list is required. It is protected by * @tmigr_mutex. * * When @timer_base->lock as well as tmigr related locks are required, the lock * ordering is: first @timer_base->lock, afterwards tmigr related locks. * * * Protection of the tmigr group state information: * ------------------------------------------------ * * The state information with the list of active children and migrator needs to * be protected by a sequence counter. It prevents a race when updates in child * groups are propagated in changed order. The state update is performed * lockless and group wise. The following scenario describes what happens * without updating the sequence counter: * * Therefore, let's take three groups and four CPUs (CPU2 and CPU3 as well * as GRP0:1 will not change during the scenario): * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:0, GRP0:1 * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = CPU0 migrator = CPU2 * active = CPU0 active = CPU2 * / \ / \ * CPUs 0 1 2 3 * active idle active idle * * * 1. CPU0 goes idle. As the update is performed group wise, in the first step * only GRP0:0 is updated. The update of GRP1:0 is pending as CPU0 has to * walk the hierarchy. * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:0, GRP0:1 * / \ * LVL 0 [GRP0:0] [GRP0:1] * --> migrator = TMIGR_NONE migrator = CPU2 * --> active = active = CPU2 * / \ / \ * CPUs 0 1 2 3 * --> idle idle active idle * * 2. While CPU0 goes idle and continues to update the state, CPU1 comes out of * idle. CPU1 updates GRP0:0. The update for GRP1:0 is pending as CPU1 also * has to walk the hierarchy. Both CPUs (CPU0 and CPU1) now walk the * hierarchy to perform the needed update from their point of view. The * currently visible state looks the following: * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:0, GRP0:1 * / \ * LVL 0 [GRP0:0] [GRP0:1] * --> migrator = CPU1 migrator = CPU2 * --> active = CPU1 active = CPU2 * / \ / \ * CPUs 0 1 2 3 * idle --> active active idle * * 3. Here is the race condition: CPU1 managed to propagate its changes (from * step 2) through the hierarchy to GRP1:0 before CPU0 (step 1) did. The * active members of GRP1:0 remain unchanged after the update since it is * still valid from CPU1 current point of view: * * LVL 1 [GRP1:0] * --> migrator = GRP0:1 * --> active = GRP0:0, GRP0:1 * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = CPU1 migrator = CPU2 * active = CPU1 active = CPU2 * / \ / \ * CPUs 0 1 2 3 * idle active active idle * * 4. Now CPU0 finally propagates its changes (from step 1) to GRP1:0. * * LVL 1 [GRP1:0] * --> migrator = GRP0:1 * --> active = GRP0:1 * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = CPU1 migrator = CPU2 * active = CPU1 active = CPU2 * / \ / \ * CPUs 0 1 2 3 * idle active active idle * * * The race of CPU0 vs. CPU1 led to an inconsistent state in GRP1:0. CPU1 is * active and is correctly listed as active in GRP0:0. However GRP1:0 does not * have GRP0:0 listed as active, which is wrong. The sequence counter has been * added to avoid inconsistent states during updates. The state is updated * atomically only if all members, including the sequence counter, match the * expected value (compare-and-exchange). * * Looking back at the previous example with the addition of the sequence * counter: The update as performed by CPU0 in step 4 will fail. CPU1 changed * the sequence number during the update in step 3 so the expected old value (as * seen by CPU0 before starting the walk) does not match. * * Prevent race between new event and last CPU going inactive * ---------------------------------------------------------- * * When the last CPU is going idle and there is a concurrent update of a new * first global timer of an idle CPU, the group and child states have to be read * while holding the lock in tmigr_update_events(). The following scenario shows * what happens, when this is not done. * * 1. Only CPU2 is active: * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:1 * next_expiry = KTIME_MAX * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = CPU2 * active = active = CPU2 * next_expiry = KTIME_MAX next_expiry = KTIME_MAX * / \ / \ * CPUs 0 1 2 3 * idle idle active idle * * 2. Now CPU 2 goes idle (and has no global timer, that has to be handled) and * propagates that to GRP0:1: * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:1 * next_expiry = KTIME_MAX * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE --> migrator = TMIGR_NONE * active = --> active = * next_expiry = KTIME_MAX next_expiry = KTIME_MAX * / \ / \ * CPUs 0 1 2 3 * idle idle --> idle idle * * 3. Now the idle state is propagated up to GRP1:0. As this is now the last * child going idle in top level group, the expiry of the next group event * has to be handed back to make sure no event is lost. As there is no event * enqueued, KTIME_MAX is handed back to CPU2. * * LVL 1 [GRP1:0] * --> migrator = TMIGR_NONE * --> active = * next_expiry = KTIME_MAX * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = TMIGR_NONE * active = active = * next_expiry = KTIME_MAX next_expiry = KTIME_MAX * / \ / \ * CPUs 0 1 2 3 * idle idle --> idle idle * * 4. CPU 0 has a new timer queued from idle and it expires at TIMER0. CPU0 * propagates that to GRP0:0: * * LVL 1 [GRP1:0] * migrator = TMIGR_NONE * active = * next_expiry = KTIME_MAX * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = TMIGR_NONE * active = active = * --> next_expiry = TIMER0 next_expiry = KTIME_MAX * / \ / \ * CPUs 0 1 2 3 * idle idle idle idle * * 5. GRP0:0 is not active, so the new timer has to be propagated to * GRP1:0. Therefore the GRP1:0 state has to be read. When the stalled value * (from step 2) is read, the timer is enqueued into GRP1:0, but nothing is * handed back to CPU0, as it seems that there is still an active child in * top level group. * * LVL 1 [GRP1:0] * migrator = TMIGR_NONE * active = * --> next_expiry = TIMER0 * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = TMIGR_NONE * active = active = * next_expiry = TIMER0 next_expiry = KTIME_MAX * / \ / \ * CPUs 0 1 2 3 * idle idle idle idle * * This is prevented by reading the state when holding the lock (when a new * timer has to be propagated from idle path):: * * CPU2 (tmigr_inactive_up()) CPU0 (tmigr_new_timer_up()) * -------------------------- --------------------------- * // step 3: * cmpxchg(&GRP1:0->state); * tmigr_update_events() { * spin_lock(&GRP1:0->lock); * // ... update events ... * // hand back first expiry when GRP1:0 is idle * spin_unlock(&GRP1:0->lock); * // ^^^ release state modification * } * tmigr_update_events() { * spin_lock(&GRP1:0->lock) * // ^^^ acquire state modification * group_state = atomic_read(&GRP1:0->state) * // .... update events ... * // hand back first expiry when GRP1:0 is idle * spin_unlock(&GRP1:0->lock) <3> * // ^^^ makes state visible for other * // callers of tmigr_new_timer_up() * } * * When CPU0 grabs the lock directly after cmpxchg, the first timer is reported * back to CPU0 and also later on to CPU2. So no timer is missed. A concurrent * update of the group state from active path is no problem, as the upcoming CPU * will take care of the group events. * * Required event and timerqueue update after a remote expiry: * ----------------------------------------------------------- * * After expiring timers of a remote CPU, a walk through the hierarchy and * update of events and timerqueues is required. It is obviously needed if there * is a 'new' global timer but also if there is no new global timer but the * remote CPU is still idle. * * 1. CPU0 and CPU1 are idle and have both a global timer expiring at the same * time. So both have an event enqueued in the timerqueue of GRP0:0. CPU3 is * also idle and has no global timer pending. CPU2 is the only active CPU and * thus also the migrator: * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:1 * --> timerqueue = evt-GRP0:0 * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = CPU2 * active = active = CPU2 * groupevt.ignore = false groupevt.ignore = true * groupevt.cpu = CPU0 groupevt.cpu = * timerqueue = evt-CPU0, timerqueue = * evt-CPU1 * / \ / \ * CPUs 0 1 2 3 * idle idle active idle * * 2. CPU2 starts to expire remote timers. It starts with LVL0 group * GRP0:1. There is no event queued in the timerqueue, so CPU2 continues with * the parent of GRP0:1: GRP1:0. In GRP1:0 it dequeues the first event. It * looks at tmigr_event::cpu struct member and expires the pending timer(s) * of CPU0. * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:1 * --> timerqueue = * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = CPU2 * active = active = CPU2 * groupevt.ignore = false groupevt.ignore = true * --> groupevt.cpu = CPU0 groupevt.cpu = * timerqueue = evt-CPU0, timerqueue = * evt-CPU1 * / \ / \ * CPUs 0 1 2 3 * idle idle active idle * * 3. Some work has to be done after expiring the timers of CPU0. If we stop * here, then CPU1's pending global timer(s) will not expire in time and the * timerqueue of GRP0:0 has still an event for CPU0 enqueued which has just * been processed. So it is required to walk the hierarchy from CPU0's point * of view and update it accordingly. CPU0's event will be removed from the * timerqueue because it has no pending timer. If CPU0 would have a timer * pending then it has to expire after CPU1's first timer because all timers * from this period were just expired. Either way CPU1's event will be first * in GRP0:0's timerqueue and therefore set in the CPU field of the group * event which is then enqueued in GRP1:0's timerqueue as GRP0:0 is still not * active: * * LVL 1 [GRP1:0] * migrator = GRP0:1 * active = GRP0:1 * --> timerqueue = evt-GRP0:0 * / \ * LVL 0 [GRP0:0] [GRP0:1] * migrator = TMIGR_NONE migrator = CPU2 * active = active = CPU2 * groupevt.ignore = false groupevt.ignore = true * --> groupevt.cpu = CPU1 groupevt.cpu = * --> timerqueue = evt-CPU1 timerqueue = * / \ / \ * CPUs 0 1 2 3 * idle idle active idle * * Now CPU2 (migrator) will continue step 2 at GRP1:0 and will expire the * timer(s) of CPU1. * * The hierarchy walk in step 3 can be skipped if the migrator notices that a * CPU of GRP0:0 is active again. The CPU will mark GRP0:0 active and take care * of the group as migrator and any needed updates within the hierarchy. */ static DEFINE_MUTEX(tmigr_mutex); static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { return !(tmc->tmgroup && tmc->online); } /* * Returns true, when @childmask corresponds to the group migrator or when the * group is not active - so no migrator is set. */ static bool tmigr_check_migrator(struct tmigr_group *group, u8 childmask) { union tmigr_state s; s.state = atomic_read(&group->migr_state); if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) return true; return false; } static bool tmigr_check_migrator_and_lonely(struct tmigr_group *group, u8 childmask) { bool lonely, migrator = false; unsigned long active; union tmigr_state s; s.state = atomic_read(&group->migr_state); if ((s.migrator == childmask) || (s.migrator == TMIGR_NONE)) migrator = true; active = s.active; lonely = bitmap_weight(&active, BIT_CNT) <= 1; return (migrator && lonely); } static bool tmigr_check_lonely(struct tmigr_group *group) { unsigned long active; union tmigr_state s; s.state = atomic_read(&group->migr_state); active = s.active; return bitmap_weight(&active, BIT_CNT) <= 1; } /** * struct tmigr_walk - data required for walking the hierarchy * @nextexp: Next CPU event expiry information which is handed into * the timer migration code by the timer code * (get_next_timer_interrupt()) * @firstexp: Contains the first event expiry information when * hierarchy is completely idle. When CPU itself was the * last going idle, information makes sure, that CPU will * be back in time. When using this value in the remote * expiry case, firstexp is stored in the per CPU tmigr_cpu * struct of CPU which expires remote timers. It is updated * in top level group only. Be aware, there could occur a * new top level of the hierarchy between the 'top level * call' in tmigr_update_events() and the check for the * parent group in walk_groups(). Then @firstexp might * contain a value != KTIME_MAX even if it was not the * final top level. This is not a problem, as the worst * outcome is a CPU which might wake up a little early. * @evt: Pointer to tmigr_event which needs to be queued (of idle * child group) * @childmask: groupmask of child group * @remote: Is set, when the new timer path is executed in * tmigr_handle_remote_cpu() * @basej: timer base in jiffies * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only * @tmc_active: this flag indicates, whether the CPU which triggers * the hierarchy walk is !idle in the timer migration * hierarchy. When the CPU is idle and the whole hierarchy is * idle, only the first event of the top level has to be * considered. */ struct tmigr_walk { u64 nextexp; u64 firstexp; struct tmigr_event *evt; u8 childmask; bool remote; unsigned long basej; u64 now; bool check; bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); static void __walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { struct tmigr_group *child = NULL, *group = tmc->tmgroup; do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); if (up(group, child, data)) break; child = group; /* * Pairs with the store release on group connection * to make sure group initialization is visible. */ group = READ_ONCE(group->parent); data->childmask = child->groupmask; WARN_ON_ONCE(!data->childmask); } while (group); } static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); __walk_groups(up, data, tmc); } /* * Returns the next event of the timerqueue @group->events * * Removes timers with ignore flag and update next_expiry of the group. Values * of the group event are updated in tmigr_update_events() only. */ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) { struct timerqueue_node *node = NULL; struct tmigr_event *evt = NULL; lockdep_assert_held(&group->lock); WRITE_ONCE(group->next_expiry, KTIME_MAX); while ((node = timerqueue_getnext(&group->events))) { evt = container_of(node, struct tmigr_event, nextevt); if (!READ_ONCE(evt->ignore)) { WRITE_ONCE(group->next_expiry, evt->nextevt.expires); return evt; } /* * Remove next timers with ignore flag, because the group lock * is held anyway */ if (!timerqueue_del(&group->events, node)) break; } return NULL; } /* * Return the next event (with the expiry equal or before @now) * * Event, which is returned, is also removed from the queue. */ static struct tmigr_event *tmigr_next_expired_groupevt(struct tmigr_group *group, u64 now) { struct tmigr_event *evt = tmigr_next_groupevt(group); if (!evt || now < evt->nextevt.expires) return NULL; /* * The event is ready to expire. Remove it and update next group event. */ timerqueue_del(&group->events, &evt->nextevt); tmigr_next_groupevt(group); return evt; } static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) { struct tmigr_event *evt; evt = tmigr_next_groupevt(group); if (!evt) return KTIME_MAX; else return evt->nextevt.expires; } static bool tmigr_active_up(struct tmigr_group *group, struct tmigr_group *child, struct tmigr_walk *data) { union tmigr_state curstate, newstate; bool walk_done; u8 childmask; childmask = data->childmask; /* * No memory barrier is required here in contrast to * tmigr_inactive_up(), as the group state change does not depend on the * child state. */ curstate.state = atomic_read(&group->migr_state); do { newstate = curstate; walk_done = true; if (newstate.migrator == TMIGR_NONE) { newstate.migrator = childmask; /* Changes need to be propagated */ walk_done = false; } newstate.active |= childmask; newstate.seq++; } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); trace_tmigr_group_set_cpu_active(group, newstate, childmask); /* * The group is active (again). The group event might be still queued * into the parent group's timerqueue but can now be handled by the * migrator of this group. Therefore the ignore flag for the group event * is updated to reflect this. * * The update of the ignore flag in the active path is done lockless. In * worst case the migrator of the parent group observes the change too * late and expires remotely all events belonging to this group. The * lock is held while updating the ignore flag in idle path. So this * state change will not be lost. */ WRITE_ONCE(group->groupevt.ignore, true); return walk_done; } static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) { struct tmigr_walk data; data.childmask = tmc->groupmask; trace_tmigr_cpu_active(tmc); tmc->cpuevt.ignore = true; WRITE_ONCE(tmc->wakeup, KTIME_MAX); walk_groups(&tmigr_active_up, &data, tmc); } /** * tmigr_cpu_activate() - set this CPU active in timer migration hierarchy * * Call site timer_clear_idle() is called with interrupts disabled. */ void tmigr_cpu_activate(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); if (tmigr_is_not_available(tmc)) return; if (WARN_ON_ONCE(!tmc->idle)) return; raw_spin_lock(&tmc->lock); tmc->idle = false; __tmigr_cpu_activate(tmc); raw_spin_unlock(&tmc->lock); } /* * Returns true, if there is nothing to be propagated to the next level * * @data->firstexp is set to expiry of first gobal event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. * * The child and group states need to be read under the lock, to prevent a race * against a concurrent tmigr_inactive_up() run when the last CPU goes idle. See * also section "Prevent race between new event and last CPU going inactive" in * the documentation at the top. * * This is the only place where the group event expiry value is set. */ static bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, struct tmigr_walk *data) { struct tmigr_event *evt, *first_childevt; union tmigr_state childstate, groupstate; bool remote = data->remote; bool walk_done = false; bool ignore; u64 nextexp; if (child) { raw_spin_lock(&child->lock); raw_spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); childstate.state = atomic_read(&child->migr_state); groupstate.state = atomic_read(&group->migr_state); if (childstate.active) { walk_done = true; goto unlock; } first_childevt = tmigr_next_groupevt(child); nextexp = child->next_expiry; evt = &child->groupevt; /* * This can race with concurrent idle exit (activate). * If the current writer wins, a useless remote expiration may * be scheduled. If the activate wins, the event is properly * ignored. */ ignore = (nextexp == KTIME_MAX) ? true : false; WRITE_ONCE(evt->ignore, ignore); } else { nextexp = data->nextexp; first_childevt = evt = data->evt; ignore = evt->ignore; /* * Walking the hierarchy is required in any case when a * remote expiry was done before. This ensures to not lose * already queued events in non active groups (see section * "Required event and timerqueue update after a remote * expiry" in the documentation at the top). * * The two call sites which are executed without a remote expiry * before, are not prevented from propagating changes through * the hierarchy by the return: * - When entering this path by tmigr_new_timer(), @evt->ignore * is never set. * - tmigr_inactive_up() takes care of the propagation by * itself and ignores the return value. But an immediate * return is possible if there is a parent, sparing group * locking at this level, because the upper walking call to * the parent will take care about removing this event from * within the group and update next_expiry accordingly. * * However if there is no parent, ie: the hierarchy has only a * single level so @group is the top level group, make sure the * first event information of the group is updated properly and * also handled properly, so skip this fast return path. */ if (ignore && !remote && group->parent) return true; raw_spin_lock(&group->lock); childstate.state = 0; groupstate.state = atomic_read(&group->migr_state); } /* * If the child event is already queued in the group, remove it from the * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { if ((evt->nextevt.expires == nextexp) && !ignore) { /* Make sure not to miss a new CPU event with the same expiry */ evt->cpu = first_childevt->cpu; goto check_toplvl; } if (!timerqueue_del(&group->events, &evt->nextevt)) WRITE_ONCE(group->next_expiry, KTIME_MAX); } if (ignore) { /* * When the next child event could be ignored (nextexp is * KTIME_MAX) and there was no remote timer handling before or * the group is already active, there is no need to walk the * hierarchy even if there is a parent group. * * The other way round: even if the event could be ignored, but * if a remote timer handling was executed before and the group * is not active, walking the hierarchy is required to not miss * an enqueued timer in the non active group. The enqueued timer * of the group needs to be propagated to a higher level to * ensure it is handled. */ if (!remote || groupstate.active) walk_done = true; } else { evt->nextevt.expires = nextexp; evt->cpu = first_childevt->cpu; if (timerqueue_add(&group->events, &evt->nextevt)) WRITE_ONCE(group->next_expiry, nextexp); } check_toplvl: if (!group->parent && (groupstate.migrator == TMIGR_NONE)) { walk_done = true; /* * Nothing to do when update was done during remote timer * handling. First timer in top level group which needs to be * handled when top level group is not active, is calculated * directly in tmigr_handle_remote_up(). */ if (remote) goto unlock; /* * The top level group is idle and it has to be ensured the * global timers are handled in time. (This could be optimized * by keeping track of the last global scheduled event and only * arming it on the CPU if the new event is earlier. Not sure if * its worth the complexity.) */ data->firstexp = tmigr_next_groupevt_expires(group); } trace_tmigr_update_events(child, group, childstate, groupstate, nextexp); unlock: raw_spin_unlock(&group->lock); if (child) raw_spin_unlock(&child->lock); return walk_done; } static bool tmigr_new_timer_up(struct tmigr_group *group, struct tmigr_group *child, struct tmigr_walk *data) { return tmigr_update_events(group, child, data); } /* * Returns the expiry of the next timer that needs to be handled. KTIME_MAX is * returned, if an active CPU will handle all the timer migration hierarchy * timers. */ static u64 tmigr_new_timer(struct tmigr_cpu *tmc, u64 nextexp) { struct tmigr_walk data = { .nextexp = nextexp, .firstexp = KTIME_MAX, .evt = &tmc->cpuevt }; lockdep_assert_held(&tmc->lock); if (tmc->remote) return KTIME_MAX; trace_tmigr_cpu_new_timer(tmc); tmc->cpuevt.ignore = false; data.remote = false; walk_groups(&tmigr_new_timer_up, &data, tmc); /* If there is a new first global event, make sure it is handled */ return data.firstexp; } static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, unsigned long jif) { struct timer_events tevt; struct tmigr_walk data; struct tmigr_cpu *tmc; tmc = per_cpu_ptr(&tmigr_cpu, cpu); raw_spin_lock_irq(&tmc->lock); /* * If the remote CPU is offline then the timers have been migrated to * another CPU. * * If tmigr_cpu::remote is set, at the moment another CPU already * expires the timers of the remote CPU. * * If tmigr_event::ignore is set, then the CPU returns from idle and * takes care of its timers. * * If the next event expires in the future, then the event has been * updated and there are no timers to expire right now. The CPU which * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. */ if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; } trace_tmigr_handle_remote_cpu(tmc); tmc->remote = true; WRITE_ONCE(tmc->wakeup, KTIME_MAX); /* Drop the lock to allow the remote CPU to exit idle */ raw_spin_unlock_irq(&tmc->lock); if (cpu != smp_processor_id()) timer_expire_remote(cpu); /* * Lock ordering needs to be preserved - timer_base locks before tmigr * related locks (see section "Locking rules" in the documentation at * the top). During fetching the next timer interrupt, also tmc->lock * needs to be held. Otherwise there is a possible race window against * the CPU itself when it comes out of idle, updates the first timer in * the hierarchy and goes back to idle. * * timer base locks are dropped as fast as possible: After checking * whether the remote CPU went offline in the meantime and after * fetching the next remote timer interrupt. Dropping the locks as fast * as possible keeps the locking region small and prevents holding * several (unnecessary) locks during walking the hierarchy for updating * the timerqueue and group events. */ local_irq_disable(); timer_lock_remote_bases(cpu); raw_spin_lock(&tmc->lock); /* * When the CPU went offline in the meantime, no hierarchy walk has to * be done for updating the queued events, because the walk was * already done during marking the CPU offline in the hierarchy. * * When the CPU is no longer idle, the CPU takes care of the timers and * also of the timers in the hierarchy. * * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ if (!tmc->online || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } /* next event of CPU */ fetch_next_timer_interrupt_remote(jif, now, &tevt, cpu); timer_unlock_remote_bases(cpu); data.nextexp = tevt.global; data.firstexp = KTIME_MAX; data.evt = &tmc->cpuevt; data.remote = true; /* * The update is done even when there is no 'new' global timer pending * on the remote CPU (see section "Required event and timerqueue update * after a remote expiry" in the documentation at the top) */ walk_groups(&tmigr_new_timer_up, &data, tmc); unlock: tmc->remote = false; raw_spin_unlock_irq(&tmc->lock); } static bool tmigr_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, struct tmigr_walk *data) { struct tmigr_event *evt; unsigned long jif; u8 childmask; u64 now; jif = data->basej; now = data->now; childmask = data->childmask; trace_tmigr_handle_remote(group); again: /* * Handle the group only if @childmask is the migrator or if the * group has no migrator. Otherwise the group is active and is * handled by its own migrator. */ if (!tmigr_check_migrator(group, childmask)) return true; raw_spin_lock_irq(&group->lock); evt = tmigr_next_expired_groupevt(group, now); if (evt) { unsigned int remote_cpu = evt->cpu; raw_spin_unlock_irq(&group->lock); tmigr_handle_remote_cpu(remote_cpu, now, jif); /* check if there is another event, that needs to be handled */ goto again; } /* * Keep track of the expiry of the first event that needs to be handled * (group->next_expiry was updated by tmigr_next_expired_groupevt(), * next was set by tmigr_handle_remote_cpu()). */ data->firstexp = group->next_expiry; raw_spin_unlock_irq(&group->lock); return false; } /** * tmigr_handle_remote() - Handle global timers of remote idle CPUs * * Called from the timer soft interrupt with interrupts enabled. */ void tmigr_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_walk data; if (tmigr_is_not_available(tmc)) return; data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; /* * NOTE: This is a doubled check because the migrator test will be done * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done. */ if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { /* * If this CPU was an idle migrator, make sure to clear its wakeup * value so it won't chase timers that have already expired elsewhere. * This avoids endless requeue from tmigr_new_timer(). */ if (READ_ONCE(tmc->wakeup) == KTIME_MAX) return; } data.now = get_jiffies_update(&data.basej); /* * Update @tmc->wakeup only at the end and do not reset @tmc->wakeup to * KTIME_MAX. Even if tmc->lock is not held during the whole remote * handling, tmc->wakeup is fine to be stale as it is called in * interrupt context and tick_nohz_next_event() is executed in interrupt * exit path only after processing the last pending interrupt. */ __walk_groups(&tmigr_handle_remote_up, &data, tmc); raw_spin_lock_irq(&tmc->lock); WRITE_ONCE(tmc->wakeup, data.firstexp); raw_spin_unlock_irq(&tmc->lock); } static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, struct tmigr_walk *data) { u8 childmask; childmask = data->childmask; /* * Handle the group only if the child is the migrator or if the group * has no migrator. Otherwise the group is active and is handled by its * own migrator. */ if (!tmigr_check_migrator(group, childmask)) return true; /* * When there is a parent group and the CPU which triggered the * hierarchy walk is not active, proceed the walk to reach the top level * group before reading the next_expiry value. */ if (group->parent && !data->tmc_active) return false; /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. On 64bit the lock is not * required because the read operation is not split and so it is always * consistent. */ if (IS_ENABLED(CONFIG_64BIT)) { data->firstexp = READ_ONCE(group->next_expiry); if (data->now >= data->firstexp) { data->check = true; return true; } } else { raw_spin_lock(&group->lock); data->firstexp = group->next_expiry; if (data->now >= group->next_expiry) { data->check = true; raw_spin_unlock(&group->lock); return true; } raw_spin_unlock(&group->lock); } return false; } /** * tmigr_requires_handle_remote() - Check the need of remote timer handling * * Must be called with interrupts disabled. */ bool tmigr_requires_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_walk data; unsigned long jif; bool ret = false; if (tmigr_is_not_available(tmc)) return ret; data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; data.tmc_active = !tmc->idle; data.check = false; /* * If the CPU is active, walk the hierarchy to check whether a remote * expiry is required. * * Check is done lockless as interrupts are disabled and @tmc->idle is * set only by the local CPU. */ if (!tmc->idle) { __walk_groups(&tmigr_requires_handle_remote_up, &data, tmc); return data.check; } /* * When the CPU is idle, compare @tmc->wakeup with @data.now. The lock * is required on 32bit architectures to read the variable consistently * with a concurrent writer. On 64bit the lock is not required because * the read operation is not split and so it is always consistent. */ if (IS_ENABLED(CONFIG_64BIT)) { if (data.now >= READ_ONCE(tmc->wakeup)) return true; } else { raw_spin_lock(&tmc->lock); if (data.now >= tmc->wakeup) ret = true; raw_spin_unlock(&tmc->lock); } return ret; } /** * tmigr_cpu_new_timer() - enqueue next global timer into hierarchy (idle tmc) * @nextexp: Next expiry of global timer (or KTIME_MAX if not) * * The CPU is already deactivated in the timer migration * hierarchy. tick_nohz_get_sleep_length() calls tick_nohz_next_event() * and thereby the timer idle path is executed once more. @tmc->wakeup * holds the first timer, when the timer migration hierarchy is * completely idle. * * Returns the first timer that needs to be handled by this CPU or KTIME_MAX if * nothing needs to be done. */ u64 tmigr_cpu_new_timer(u64 nextexp) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); u64 ret; if (tmigr_is_not_available(tmc)) return nextexp; raw_spin_lock(&tmc->lock); ret = READ_ONCE(tmc->wakeup); if (nextexp != KTIME_MAX) { if (nextexp != tmc->cpuevt.nextevt.expires || tmc->cpuevt.ignore) { ret = tmigr_new_timer(tmc, nextexp); /* * Make sure the reevaluation of timers in idle path * will not miss an event. */ WRITE_ONCE(tmc->wakeup, ret); } } trace_tmigr_cpu_new_timer_idle(tmc, nextexp); raw_spin_unlock(&tmc->lock); return ret; } static bool tmigr_inactive_up(struct tmigr_group *group, struct tmigr_group *child, struct tmigr_walk *data) { union tmigr_state curstate, newstate, childstate; bool walk_done; u8 childmask; childmask = data->childmask; childstate.state = 0; /* * The memory barrier is paired with the cmpxchg() in tmigr_active_up() * to make sure the updates of child and group states are ordered. The * ordering is mandatory, as the group state change depends on the child * state. */ curstate.state = atomic_read_acquire(&group->migr_state); for (;;) { if (child) childstate.state = atomic_read(&child->migr_state); newstate = curstate; walk_done = true; /* Reset active bit when the child is no longer active */ if (!childstate.active) newstate.active &= ~childmask; if (newstate.migrator == childmask) { /* * Find a new migrator for the group, because the child * group is idle! */ if (!childstate.active) { unsigned long new_migr_bit, active = newstate.active; new_migr_bit = find_first_bit(&active, BIT_CNT); if (new_migr_bit != BIT_CNT) { newstate.migrator = BIT(new_migr_bit); } else { newstate.migrator = TMIGR_NONE; /* Changes need to be propagated */ walk_done = false; } } } newstate.seq++; WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); break; } /* * The memory barrier is paired with the cmpxchg() in * tmigr_active_up() to make sure the updates of child and group * states are ordered. It is required only when the above * try_cmpxchg() fails. */ smp_mb__after_atomic(); } data->remote = false; /* Event Handling */ tmigr_update_events(group, child, data); return walk_done; } static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) { struct tmigr_walk data = { .nextexp = nextexp, .firstexp = KTIME_MAX, .evt = &tmc->cpuevt, .childmask = tmc->groupmask }; /* * If nextexp is KTIME_MAX, the CPU event will be ignored because the * local timer expires before the global timer, no global timer is set * or CPU goes offline. */ if (nextexp != KTIME_MAX) tmc->cpuevt.ignore = false; walk_groups(&tmigr_inactive_up, &data, tmc); return data.firstexp; } /** * tmigr_cpu_deactivate() - Put current CPU into inactive state * @nextexp: The next global timer expiry of the current CPU * * Must be called with interrupts disabled. * * Return: the next event expiry of the current CPU or the next event expiry * from the hierarchy if this CPU is the top level migrator or the hierarchy is * completely idle. */ u64 tmigr_cpu_deactivate(u64 nextexp) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); u64 ret; if (tmigr_is_not_available(tmc)) return nextexp; raw_spin_lock(&tmc->lock); ret = __tmigr_cpu_deactivate(tmc, nextexp); tmc->idle = true; /* * Make sure the reevaluation of timers in idle path will not miss an * event. */ WRITE_ONCE(tmc->wakeup, ret); trace_tmigr_cpu_idle(tmc, nextexp); raw_spin_unlock(&tmc->lock); return ret; } /** * tmigr_quick_check() - Quick forecast of next tmigr event when CPU wants to * go idle * @nextevt: The next global timer expiry of the current CPU * * Return: * * KTIME_MAX - when it is probable that nothing has to be done (not * the only one in the level 0 group; and if it is the * only one in level 0 group, but there are more than a * single group active on the way to top level) * * nextevt - when CPU is offline and has to handle timer on its own * or when on the way to top in every group only a single * child is active but @nextevt is before the lowest * next_expiry encountered while walking up to top level. * * next_expiry - value of lowest expiry encountered while walking groups * if only a single child is active on each and @nextevt * is after this lowest expiry. */ u64 tmigr_quick_check(u64 nextevt) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_group *group = tmc->tmgroup; if (tmigr_is_not_available(tmc)) return nextevt; if (WARN_ON_ONCE(tmc->idle)) return nextevt; if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) return KTIME_MAX; do { if (!tmigr_check_lonely(group)) { return KTIME_MAX; } else { /* * Since current CPU is active, events may not be sorted * from bottom to the top because the CPU's event is ignored * up to the top and its sibling's events not propagated upwards. * Thus keep track of the lowest observed expiry. */ nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); if (!group->parent) return nextevt; } group = group->parent; } while (group); return KTIME_MAX; } /* * tmigr_trigger_active() - trigger a CPU to become active again * * This function is executed on a CPU which is part of cpu_online_mask, when the * last active CPU in the hierarchy is offlining. With this, it is ensured that * the other CPU is active and takes over the migrator duty. */ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); WARN_ON_ONCE(!tmc->online || tmc->idle); return 0; } static int tmigr_cpu_offline(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; raw_spin_lock_irq(&tmc->lock); tmc->online = false; WRITE_ONCE(tmc->wakeup, KTIME_MAX); /* * CPU has to handle the local events on his own, when on the way to * offline; Therefore nextevt value is set to KTIME_MAX */ firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); trace_tmigr_cpu_offline(tmc); raw_spin_unlock_irq(&tmc->lock); if (firstexp != KTIME_MAX) { migrator = cpumask_any_but(cpu_online_mask, cpu); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } static int tmigr_cpu_online(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); /* Check whether CPU data was successfully initialized */ if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; raw_spin_lock_irq(&tmc->lock); trace_tmigr_cpu_online(tmc); tmc->idle = timer_base_is_idle(); if (!tmc->idle) __tmigr_cpu_activate(tmc); tmc->online = true; raw_spin_unlock_irq(&tmc->lock); return 0; } static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { union tmigr_state s; raw_spin_lock_init(&group->lock); group->level = lvl; group->numa_node = lvl < tmigr_crossnode_level ? node : NUMA_NO_NODE; group->num_children = 0; s.migrator = TMIGR_NONE; s.active = 0; s.seq = 0; atomic_set(&group->migr_state, s.state); /* * If this is a new top-level, prepare its groupmask in advance. * This avoids accidents where yet another new top-level is * created in the future and made visible before the current groupmask. */ if (list_empty(&tmigr_level_list[lvl])) { group->groupmask = BIT(0); /* * The previous top level has prepared its groupmask already, * simply account it as the first child. */ if (lvl > 0) group->num_children = 1; } timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; WRITE_ONCE(group->next_expiry, KTIME_MAX); group->groupevt.ignore = true; } static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; lockdep_assert_held(&tmigr_mutex); /* Try to attach to an existing group first */ list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { /* * If @lvl is below the cross NUMA node level, check whether * this group belongs to the same NUMA node. */ if (lvl < tmigr_crossnode_level && tmp->numa_node != node) continue; /* Capacity left? */ if (tmp->num_children >= TMIGR_CHILDREN_PER_GROUP) continue; /* * TODO: A possible further improvement: Make sure that all CPU * siblings end up in the same group of the lowest level of the * hierarchy. Rely on the topology sibling mask would be a * reasonable solution. */ group = tmp; break; } if (group) return group; /* Allocate and set up a new group */ group = kzalloc_node(sizeof(*group), GFP_KERNEL, node); if (!group) return ERR_PTR(-ENOMEM); tmigr_init_group(group, lvl, node); /* Setup successful. Add it to the hierarchy */ list_add(&group->list, &tmigr_level_list[lvl]); trace_tmigr_group_set(group); return group; } static void tmigr_connect_child_parent(struct tmigr_group *child, struct tmigr_group *parent, bool activate) { struct tmigr_walk data; raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); if (activate) { /* * @child is the old top and @parent the new one. In this * case groupmask is pre-initialized and @child already * accounted, along with its new sibling corresponding to the * CPU going up. */ WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); } else { /* Adding @child for the CPU going up to @parent. */ child->groupmask = BIT(parent->num_children++); } /* * Make sure parent initialization is visible before publishing it to a * racing CPU entering/exiting idle. This RELEASE barrier enforces an * address dependency that pairs with the READ_ONCE() in __walk_groups(). */ smp_store_release(&child->parent, parent); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); trace_tmigr_connect_child_parent(child); if (!activate) return; /* * To prevent inconsistent states, active children need to be active in * the new parent as well. Inactive children are already marked inactive * in the parent group: * * * When new groups were created by tmigr_setup_groups() starting from * the lowest level (and not higher then one level below the current * top level), then they are not active. They will be set active when * the new online CPU comes active. * * * But if a new group above the current top level is required, it is * mandatory to propagate the active state of the already existing * child to the new parent. So tmigr_connect_child_parent() is * executed with the formerly top level group (child) and the newly * created group (parent). * * * It is ensured that the child is active, as this setup path is * executed in hotplug prepare callback. This is exectued by an * already connected and !idle CPU. Even if all other CPUs go idle, * the CPU executing the setup will be responsible up to current top * level group. And the next time it goes inactive, it will release * the new childmask and parent to subsequent walkers through this * @child. Therefore propagate active state unconditionally. */ data.childmask = child->groupmask; /* * There is only one new level per time (which is protected by * tmigr_mutex). When connecting the child and the parent and set the * child active when the parent is inactive, the parent needs to be the * uppermost level. Otherwise there went something wrong! */ WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } static int tmigr_setup_groups(unsigned int cpu, unsigned int node) { struct tmigr_group *group, *child, **stack; int top = 0, err = 0, i = 0; struct list_head *lvllist; stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); if (!stack) return -ENOMEM; do { group = tmigr_get_group(cpu, node, i); if (IS_ERR(group)) { err = PTR_ERR(group); break; } top = i; stack[i++] = group; /* * When booting only less CPUs of a system than CPUs are * available, not all calculated hierarchy levels are required. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a * single group. */ if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) break; } while (i < tmigr_hierarchy_levels); /* Assert single root */ WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); while (i > 0) { group = stack[--i]; if (err < 0) { list_del(&group->list); kfree(group); continue; } WARN_ON_ONCE(i != group->level); /* * Update tmc -> group / child -> group connection */ if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); raw_spin_lock_irq(&group->lock); tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); raw_spin_unlock_irq(&group->lock); trace_tmigr_connect_cpu_parent(tmc); /* There are no children that need to be connected */ continue; } else { child = stack[i - 1]; /* Will be activated at online time */ tmigr_connect_child_parent(child, group, false); } /* check if uppermost level was newly created */ if (top != i) continue; WARN_ON_ONCE(top == 0); lvllist = &tmigr_level_list[top]; /* * Newly created root level should have accounted the upcoming * CPU's child group and pre-accounted the old root. */ if (group->num_children == 2 && list_is_singular(lvllist)) { /* * The target CPU must never do the prepare work, except * on early boot when the boot CPU is the target. Otherwise * it may spuriously activate the old top level group inside * the new one (nevertheless whether old top level group is * active or not) and/or release an uninitialized childmask. */ WARN_ON_ONCE(cpu == raw_smp_processor_id()); lvllist = &tmigr_level_list[top - 1]; list_for_each_entry(child, lvllist, list) { if (child->parent) continue; tmigr_connect_child_parent(child, group, true); } } } kfree(stack); return err; } static int tmigr_add_cpu(unsigned int cpu) { int node = cpu_to_node(cpu); int ret; mutex_lock(&tmigr_mutex); ret = tmigr_setup_groups(cpu, node); mutex_unlock(&tmigr_mutex); return ret; } static int tmigr_cpu_prepare(unsigned int cpu) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); int ret = 0; /* Not first online attempt? */ if (tmc->tmgroup) return ret; raw_spin_lock_init(&tmc->lock); timerqueue_init(&tmc->cpuevt.nextevt); tmc->cpuevt.nextevt.expires = KTIME_MAX; tmc->cpuevt.ignore = true; tmc->cpuevt.cpu = cpu; tmc->remote = false; WRITE_ONCE(tmc->wakeup, KTIME_MAX); ret = tmigr_add_cpu(cpu); if (ret < 0) return ret; if (tmc->groupmask == 0) return -EINVAL; return ret; } static int __init tmigr_init(void) { unsigned int cpulvl, nodelvl, cpus_per_node, i; unsigned int nnodes = num_possible_nodes(); unsigned int ncpus = num_possible_cpus(); int ret = -ENOMEM; BUILD_BUG_ON_NOT_POWER_OF_2(TMIGR_CHILDREN_PER_GROUP); /* Nothing to do if running on UP */ if (ncpus == 1) return 0; /* * Calculate the required hierarchy levels. Unfortunately there is no * reliable information available, unless all possible CPUs have been * brought up and all NUMA nodes are populated. * * Estimate the number of levels with the number of possible nodes and * the number of possible CPUs. Assume CPUs are spread evenly across * nodes. We cannot rely on cpumask_of_node() because it only works for * online CPUs. */ cpus_per_node = DIV_ROUND_UP(ncpus, nnodes); /* Calc the hierarchy levels required to hold the CPUs of a node */ cpulvl = DIV_ROUND_UP(order_base_2(cpus_per_node), ilog2(TMIGR_CHILDREN_PER_GROUP)); /* Calculate the extra levels to connect all nodes */ nodelvl = DIV_ROUND_UP(order_base_2(nnodes), ilog2(TMIGR_CHILDREN_PER_GROUP)); tmigr_hierarchy_levels = cpulvl + nodelvl; /* * If a NUMA node spawns more than one CPU level group then the next * level(s) of the hierarchy contains groups which handle all CPU groups * of the same NUMA node. The level above goes across NUMA nodes. Store * this information for the setup code to decide in which level node * matching is no longer required. */ tmigr_crossnode_level = cpulvl; tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL); if (!tmigr_level_list) goto err; for (i = 0; i < tmigr_hierarchy_levels; i++) INIT_LIST_HEAD(&tmigr_level_list[i]); pr_info("Timer migration: %d hierarchy levels; %d children per group;" " %d crossnode level\n", tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, tmigr_crossnode_level); ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare", tmigr_cpu_prepare, NULL); if (ret) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", tmigr_cpu_online, tmigr_cpu_offline); if (ret) goto err; return 0; err: pr_err("Timer migration setup failed\n"); return ret; } early_initcall(tmigr_init); |
| 21 4 4 21 21 21 21 21 4 4 4 21 21 21 4 2 4 4 4 4 4 37 37 37 37 11 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 | // SPDX-License-Identifier: GPL-2.0-only /* * BSS client mode implementation * Copyright 2003-2008, Jouni Malinen <j@w1.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018 - 2025 Intel Corporation */ #include <linux/delay.h> #include <linux/fips.h> #include <linux/if_ether.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/moduleparam.h> #include <linux/rtnetlink.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/export.h> #include <net/mac80211.h> #include <linux/unaligned.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "led.h" #include "fils_aead.h" #include <kunit/static_stub.h> #define IEEE80211_AUTH_TIMEOUT (HZ / 5) #define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2) #define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_AUTH_TIMEOUT_SAE (HZ * 2) #define IEEE80211_AUTH_MAX_TRIES 3 #define IEEE80211_AUTH_WAIT_ASSOC (HZ * 5) #define IEEE80211_AUTH_WAIT_SAE_RETRY (HZ * 2) #define IEEE80211_ASSOC_TIMEOUT (HZ / 5) #define IEEE80211_ASSOC_TIMEOUT_LONG (HZ / 2) #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 #define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 #define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) static int max_nullfunc_tries = 2; module_param(max_nullfunc_tries, int, 0644); MODULE_PARM_DESC(max_nullfunc_tries, "Maximum nullfunc tx tries before disconnecting (reason 4)."); static int max_probe_tries = 5; module_param(max_probe_tries, int, 0644); MODULE_PARM_DESC(max_probe_tries, "Maximum probe tries before disconnecting (reason 4)."); /* * Beacon loss timeout is calculated as N frames times the * advertised beacon interval. This may need to be somewhat * higher than what hardware might detect to account for * delays in the host processing frames. But since we also * probe on beacon miss before declaring the connection lost * default to what we want. */ static int beacon_loss_count = 7; module_param(beacon_loss_count, int, 0644); MODULE_PARM_DESC(beacon_loss_count, "Number of beacon intervals before we decide beacon was lost."); /* * Time the connection can be idle before we probe * it to see if we can still talk to the AP. */ #define IEEE80211_CONNECTION_IDLE_TIME (30 * HZ) /* * Time we wait for a probe response after sending * a probe request because of beacon loss or for * checking the connection still works. */ static int probe_wait_ms = 500; module_param(probe_wait_ms, int, 0644); MODULE_PARM_DESC(probe_wait_ms, "Maximum time(ms) to wait for probe response" " before disconnecting (reason 4)."); /* * How many Beacon frames need to have been used in average signal strength * before starting to indicate signal change events. */ #define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 /* * We can have multiple work items (and connection probing) * scheduling this timer, but we need to take care to only * reschedule it when it should fire _earlier_ than it was * asked for before, or if it's not pending right now. This * function ensures that. Note that it then is required to * run this function for all timeouts after the first one * has happened -- the work that runs from this timer will * do that. */ static void run_again(struct ieee80211_sub_if_data *sdata, unsigned long timeout) { lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!timer_pending(&sdata->u.mgd.timer) || time_before(timeout, sdata->u.mgd.timer.expires)) mod_timer(&sdata->u.mgd.timer, timeout); } void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&sdata->u.mgd.bcn_mon_timer, round_jiffies_up(jiffies + sdata->u.mgd.beacon_timeout)); } void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (unlikely(!ifmgd->associated)) return; if (ifmgd->probe_send_count) ifmgd->probe_send_count = 0; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } static int ecw2cw(int ecw) { return (1 << ecw) - 1; } static enum ieee80211_conn_mode ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 vht_cap_info, const struct ieee802_11_elems *elems, bool ignore_ht_channel_mismatch, const struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *chandef) { const struct ieee80211_ht_operation *ht_oper = elems->ht_operation; const struct ieee80211_vht_operation *vht_oper = elems->vht_operation; const struct ieee80211_he_operation *he_oper = elems->he_operation; const struct ieee80211_eht_operation *eht_oper = elems->eht_operation; struct ieee80211_supported_band *sband = sdata->local->hw.wiphy->bands[channel->band]; struct cfg80211_chan_def vht_chandef; bool no_vht = false; u32 ht_cfreq; if (ieee80211_hw_check(&sdata->local->hw, STRICT)) ignore_ht_channel_mismatch = false; *chandef = (struct cfg80211_chan_def) { .chan = channel, .width = NL80211_CHAN_WIDTH_20_NOHT, .center_freq1 = channel->center_freq, .freq1_offset = channel->freq_offset, }; /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { sdata_info(sdata, "Missing S1G Operation Element? Trying operating == primary\n"); chandef->width = ieee80211_s1g_channel_width(channel); } return IEEE80211_CONN_MODE_S1G; } /* get special 6 GHz case out of the way */ if (sband->band == NL80211_BAND_6GHZ) { enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT; /* this is an error */ if (conn->mode < IEEE80211_CONN_MODE_HE) return IEEE80211_CONN_MODE_LEGACY; if (!elems->he_6ghz_capa || !elems->he_cap) { sdata_info(sdata, "HE 6 GHz AP is missing HE/HE 6 GHz band capability\n"); return IEEE80211_CONN_MODE_LEGACY; } if (!eht_oper || !elems->eht_cap) { eht_oper = NULL; mode = IEEE80211_CONN_MODE_HE; } if (!ieee80211_chandef_he_6ghz_oper(sdata->local, he_oper, eht_oper, chandef)) { sdata_info(sdata, "bad HE/EHT 6 GHz operation\n"); return IEEE80211_CONN_MODE_LEGACY; } return mode; } /* now we have the progression HT, VHT, ... */ if (conn->mode < IEEE80211_CONN_MODE_HT) return IEEE80211_CONN_MODE_LEGACY; if (!ht_oper || !elems->ht_cap_elem) return IEEE80211_CONN_MODE_LEGACY; chandef->width = NL80211_CHAN_WIDTH_20; ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ if (!ignore_ht_channel_mismatch && channel->center_freq != ht_cfreq) { /* * It's possible that some APs are confused here; * Netgear WNDR3700 sometimes reports 4 higher than * the actual channel in association responses, but * since we look at probe response/beacon data here * it should be OK. */ sdata_info(sdata, "Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n", channel->center_freq, ht_cfreq, ht_oper->primary_chan, channel->band); return IEEE80211_CONN_MODE_LEGACY; } ieee80211_chandef_ht_oper(ht_oper, chandef); if (conn->mode < IEEE80211_CONN_MODE_VHT) return IEEE80211_CONN_MODE_HT; vht_chandef = *chandef; /* * having he_cap/he_oper parsed out implies we're at * least operating as HE STA */ if (elems->he_cap && he_oper && he_oper->he_oper_params & cpu_to_le32(IEEE80211_HE_OPERATION_VHT_OPER_INFO)) { struct ieee80211_vht_operation he_oper_vht_cap; /* * Set only first 3 bytes (other 2 aren't used in * ieee80211_chandef_vht_oper() anyway) */ memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { sdata_info(sdata, "HE AP VHT information is invalid, disabling HE\n"); /* this will cause us to re-parse as VHT STA */ return IEEE80211_CONN_MODE_VHT; } } else if (!vht_oper || !elems->vht_cap_elem) { if (sband->band == NL80211_BAND_5GHZ) { sdata_info(sdata, "VHT information is missing, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } no_vht = true; } else if (sband->band == NL80211_BAND_2GHZ) { no_vht = true; } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, vht_oper, ht_oper, &vht_chandef)) { sdata_info(sdata, "AP VHT information is invalid, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } if (!cfg80211_chandef_compatible(chandef, &vht_chandef)) { sdata_info(sdata, "AP VHT information doesn't match HT, disabling VHT\n"); return IEEE80211_CONN_MODE_HT; } *chandef = vht_chandef; /* stick to current max mode if we or the AP don't have HE */ if (conn->mode < IEEE80211_CONN_MODE_HE || !elems->he_operation || !elems->he_cap) { if (no_vht) return IEEE80211_CONN_MODE_HT; return IEEE80211_CONN_MODE_VHT; } /* stick to HE if we or the AP don't have EHT */ if (conn->mode < IEEE80211_CONN_MODE_EHT || !eht_oper || !elems->eht_cap) return IEEE80211_CONN_MODE_HE; /* * handle the case that the EHT operation indicates that it holds EHT * operation information (in case that the channel width differs from * the channel width reported in HT/VHT/HE). */ if (eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT) { struct cfg80211_chan_def eht_chandef = *chandef; ieee80211_chandef_eht_oper((const void *)eht_oper->optional, &eht_chandef); eht_chandef.punctured = ieee80211_eht_oper_dis_subchan_bitmap(eht_oper); if (!cfg80211_chandef_valid(&eht_chandef)) { sdata_info(sdata, "AP EHT information is invalid, disabling EHT\n"); return IEEE80211_CONN_MODE_HE; } if (!cfg80211_chandef_compatible(chandef, &eht_chandef)) { sdata_info(sdata, "AP EHT information doesn't match HT/VHT/HE, disabling EHT\n"); return IEEE80211_CONN_MODE_HE; } *chandef = eht_chandef; } return IEEE80211_CONN_MODE_EHT; } static bool ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_ht_operation *ht_op) { struct ieee80211_sta_ht_cap sta_ht_cap; int i; if (sband->band == NL80211_BAND_6GHZ) return true; if (!ht_op) return false; memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of an HT STA receives an MLME-JOIN.request primitive * with the SelectedBSS parameter containing a Basic HT-MCS Set field * in the HT Operation parameter that contains any unsupported MCSs, * the MLME response in the resulting MLME-JOIN.confirm primitive shall * contain a ResultCode parameter that is not set to the value SUCCESS. * ... */ /* Simply check that all basic rates are in the STA RX mask */ for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { if ((ht_op->basic_set[i] & sta_ht_cap.mcs.rx_mask[i]) != ht_op->basic_set[i]) return false; } return true; } static bool ieee80211_verify_sta_vht_mcs_support(struct ieee80211_sub_if_data *sdata, int link_id, struct ieee80211_supported_band *sband, const struct ieee80211_vht_operation *vht_op) { struct ieee80211_sta_vht_cap sta_vht_cap; u16 ap_min_req_set, sta_rx_mcs_map, sta_tx_mcs_map; int nss; if (sband->band != NL80211_BAND_5GHZ) return true; if (!vht_op) return false; memcpy(&sta_vht_cap, &sband->vht_cap, sizeof(sta_vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &sta_vht_cap); ap_min_req_set = le16_to_cpu(vht_op->basic_mcs_set); sta_rx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.rx_mcs_map); sta_tx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.tx_mcs_map); /* * Many APs are incorrectly advertising an all-zero value here, * which really means MCS 0-7 are required for 1-8 streams, but * they don't really mean it that way. * Some other APs are incorrectly advertising 3 spatial streams * with MCS 0-7 are required, but don't really mean it that way * and we'll connect only with HT, rather than even HE. * As a result, unfortunately the VHT basic MCS/NSS set cannot * be used at all, so check it only in strict mode. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) return true; /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... * If the MLME of a VHT STA receives an MLME-JOIN.request primitive * with a SelectedBSS parameter containing a Basic VHT-MCS And NSS Set * field in the VHT Operation parameter that contains any unsupported * <VHT-MCS, NSS> tuple, the MLME response in the resulting * MLME-JOIN.confirm primitive shall contain a ResultCode parameter * that is not set to the value SUCCESS. * ... */ for (nss = 8; nss > 0; nss--) { u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; u8 sta_rx_val; u8 sta_tx_val; if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; sta_rx_val = (sta_rx_mcs_map >> (2 * (nss - 1))) & 3; sta_tx_val = (sta_tx_mcs_map >> (2 * (nss - 1))) & 3; if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_rx_val < ap_op_val || sta_tx_val < ap_op_val) { link_id_info(sdata, link_id, "Missing mandatory rates for %d Nss, rx %d, tx %d oper %d, disable VHT\n", nss, sta_rx_val, sta_tx_val, ap_op_val); return false; } } return true; } static bool ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, int link_id, const struct ieee80211_he_cap_elem *he_cap, const struct ieee80211_he_operation *he_op) { struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; u16 mcs_80_map_tx, mcs_80_map_rx; u16 ap_min_req_set; int nss; if (!he_cap) return false; /* mcs_nss is right after he_cap info */ he_mcs_nss_supp = (void *)(he_cap + 1); mcs_80_map_tx = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); mcs_80_map_rx = le16_to_cpu(he_mcs_nss_supp->rx_mcs_80); /* P802.11-REVme/D0.3 * 27.1.1 Introduction to the HE PHY * ... * An HE STA shall support the following features: * ... * Single spatial stream HE-MCSs 0 to 7 (transmit and receive) in all * supported channel widths for HE SU PPDUs */ if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED || (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) { link_id_info(sdata, link_id, "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", mcs_80_map_tx, mcs_80_map_rx); return false; } if (!he_op) return true; ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* make sure the AP is consistent with itself * * P802.11-REVme/D0.3 * 26.17.1 Basic HE BSS operation * * A STA that is operating in an HE BSS shall be able to receive and * transmit at each of the <HE-MCS, NSS> tuple values indicated by the * Basic HE-MCS And NSS Set field of the HE Operation parameter of the * MLME-START.request primitive and shall be able to receive at each of * the <HE-MCS, NSS> tuple values indicated by the Supported HE-MCS and * NSS Set field in the HE Capabilities parameter of the MLMESTART.request * primitive */ for (nss = 8; nss > 0; nss--) { u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; u8 ap_rx_val; u8 ap_tx_val; if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; ap_rx_val = (mcs_80_map_rx >> (2 * (nss - 1))) & 3; ap_tx_val = (mcs_80_map_tx >> (2 * (nss - 1))) & 3; if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) { link_id_info(sdata, link_id, "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", nss, ap_rx_val, ap_tx_val, ap_op_val); return false; } } return true; } static bool ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_he_operation *he_op) { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); u16 ap_min_req_set; int i; if (!sta_he_cap || !he_op) return false; ap_min_req_set = le16_to_cpu(he_op->he_mcs_nss_set); /* * Apparently iPhone 13 (at least iOS version 15.3.1) sets this to all * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* Need to go over for 80MHz, 160MHz and for 80+80 */ for (i = 0; i < 3; i++) { const struct ieee80211_he_mcs_nss_supp *sta_mcs_nss_supp = &sta_he_cap->he_mcs_nss_supp; u16 sta_mcs_map_rx = le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i]); u16 sta_mcs_map_tx = le16_to_cpu(((__le16 *)sta_mcs_nss_supp)[2 * i + 1]); u8 nss; bool verified = true; /* * For each band there is a maximum of 8 spatial streams * possible. Each of the sta_mcs_map_* is a 16-bit struct built * of 2 bits per NSS (1-8), with the values defined in enum * ieee80211_he_mcs_support. Need to make sure STA TX and RX * capabilities aren't less than the AP's minimum requirements * for this HE BSS per SS. * It is enough to find one such band that meets the reqs. */ for (nss = 8; nss > 0; nss--) { u8 sta_rx_val = (sta_mcs_map_rx >> (2 * (nss - 1))) & 3; u8 sta_tx_val = (sta_mcs_map_tx >> (2 * (nss - 1))) & 3; u8 ap_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; if (ap_val == IEEE80211_HE_MCS_NOT_SUPPORTED) continue; /* * Make sure the HE AP doesn't require MCSs that aren't * supported by the client as required by spec * * P802.11-REVme/D0.3 * 26.17.1 Basic HE BSS operation * * An HE STA shall not attempt to join * (MLME-JOIN.request primitive) * a BSS, unless it supports (i.e., is able to both transmit and * receive using) all of the <HE-MCS, NSS> tuples in the basic * HE-MCS and NSS set. */ if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || (ap_val > sta_rx_val) || (ap_val > sta_tx_val)) { verified = false; break; } } if (verified) return true; } /* If here, STA doesn't meet AP's HE min requirements */ return false; } static u8 ieee80211_get_eht_cap_mcs_nss(const struct ieee80211_sta_he_cap *sta_he_cap, const struct ieee80211_sta_eht_cap *sta_eht_cap, unsigned int idx, int bw) { u8 he_phy_cap0 = sta_he_cap->he_cap_elem.phy_cap_info[0]; u8 eht_phy_cap0 = sta_eht_cap->eht_cap_elem.phy_cap_info[0]; /* handle us being a 20 MHz-only EHT STA - with four values * for MCS 0-7, 8-9, 10-11, 12-13. */ if (!(he_phy_cap0 & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL)) return sta_eht_cap->eht_mcs_nss_supp.only_20mhz.rx_tx_max_nss[idx]; /* the others have MCS 0-9 together, rather than separately from 0-7 */ if (idx > 0) idx--; switch (bw) { case 0: return sta_eht_cap->eht_mcs_nss_supp.bw._80.rx_tx_max_nss[idx]; case 1: if (!(he_phy_cap0 & (IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G))) return 0xff; /* pass check */ return sta_eht_cap->eht_mcs_nss_supp.bw._160.rx_tx_max_nss[idx]; case 2: if (!(eht_phy_cap0 & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ)) return 0xff; /* pass check */ return sta_eht_cap->eht_mcs_nss_supp.bw._320.rx_tx_max_nss[idx]; } WARN_ON(1); return 0; } static bool ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_eht_operation *eht_op) { const struct ieee80211_sta_he_cap *sta_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_sta_eht_cap *sta_eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); const struct ieee80211_eht_mcs_nss_supp_20mhz_only *req; unsigned int i; if (!sta_he_cap || !sta_eht_cap || !eht_op) return false; req = &eht_op->basic_mcs_nss; for (i = 0; i < ARRAY_SIZE(req->rx_tx_max_nss); i++) { u8 req_rx_nss, req_tx_nss; unsigned int bw; req_rx_nss = u8_get_bits(req->rx_tx_max_nss[i], IEEE80211_EHT_MCS_NSS_RX); req_tx_nss = u8_get_bits(req->rx_tx_max_nss[i], IEEE80211_EHT_MCS_NSS_TX); for (bw = 0; bw < 3; bw++) { u8 have, have_rx_nss, have_tx_nss; have = ieee80211_get_eht_cap_mcs_nss(sta_he_cap, sta_eht_cap, i, bw); have_rx_nss = u8_get_bits(have, IEEE80211_EHT_MCS_NSS_RX); have_tx_nss = u8_get_bits(have, IEEE80211_EHT_MCS_NSS_TX); if (req_rx_nss > have_rx_nss || req_tx_nss > have_tx_nss) return false; } } return true; } static void ieee80211_get_rates(struct ieee80211_supported_band *sband, const u8 *supp_rates, unsigned int supp_rates_len, const u8 *ext_supp_rates, unsigned int ext_supp_rates_len, u32 *rates, u32 *basic_rates, unsigned long *unknown_rates_selectors, bool *have_higher_than_11mbit, int *min_rate, int *min_rate_index) { int i, j; for (i = 0; i < supp_rates_len + ext_supp_rates_len; i++) { u8 supp_rate = i < supp_rates_len ? supp_rates[i] : ext_supp_rates[i - supp_rates_len]; int rate = supp_rate & 0x7f; bool is_basic = !!(supp_rate & 0x80); if ((rate * 5) > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; /* * Skip membership selectors since they're not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (is_basic && rate >= BSS_MEMBERSHIP_SELECTOR_MIN) { if (unknown_rates_selectors) set_bit(rate, unknown_rates_selectors); continue; } for (j = 0; j < sband->n_bitrates; j++) { struct ieee80211_rate *br; int brate; br = &sband->bitrates[j]; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { if (rates) *rates |= BIT(j); if (is_basic && basic_rates) *basic_rates |= BIT(j); if (min_rate && (rate * 5) < *min_rate) { *min_rate = rate * 5; if (min_rate_index) *min_rate_index = j; } break; } } /* Handle an unknown entry as if it is an unknown selector */ if (is_basic && unknown_rates_selectors && j == sband->n_bitrates) set_bit(rate, unknown_rates_selectors); } } static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 prohibited_flags) { if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef, prohibited_flags)) return false; if (chandef->punctured && ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) return false; if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ && ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ)) return false; return true; } static int ieee80211_chandef_num_subchans(const struct cfg80211_chan_def *c) { if (c->width == NL80211_CHAN_WIDTH_80P80) return 4 + 4; return cfg80211_chandef_get_width(c) / 20; } static int ieee80211_chandef_num_widths(const struct cfg80211_chan_def *c) { switch (c->width) { case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: return 1; case NL80211_CHAN_WIDTH_40: return 2; case NL80211_CHAN_WIDTH_80P80: case NL80211_CHAN_WIDTH_80: return 3; case NL80211_CHAN_WIDTH_160: return 4; case NL80211_CHAN_WIDTH_320: return 5; default: WARN_ON(1); return 0; } } VISIBLE_IF_MAC80211_KUNIT int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, u8 n_partial_subchans) { int n = ieee80211_chandef_num_subchans(ap); struct cfg80211_chan_def tmp = *ap; int offset = 0; /* * Given a chandef (in this context, it's the AP's) and a number * of subchannels that we want to look at ('n_partial_subchans'), * calculate the offset in number of subchannels between the full * and the subset with the desired width. */ /* same number of subchannels means no offset, obviously */ if (n == n_partial_subchans) return 0; /* don't WARN - misconfigured APs could cause this if their N > width */ if (n < n_partial_subchans) return 0; while (ieee80211_chandef_num_subchans(&tmp) > n_partial_subchans) { u32 prev = tmp.center_freq1; ieee80211_chandef_downgrade(&tmp, NULL); /* * if center_freq moved up, half the original channels * are gone now but were below, so increase offset */ if (prev < tmp.center_freq1) offset += ieee80211_chandef_num_subchans(&tmp); } /* * 80+80 with secondary 80 below primary - four subchannels for it * (we cannot downgrade *to* 80+80, so no need to consider 'tmp') */ if (ap->width == NL80211_CHAN_WIDTH_80P80 && ap->center_freq2 < ap->center_freq1) offset += 4; return offset; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_calc_chandef_subchan_offset); VISIBLE_IF_MAC80211_KUNIT void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used) { u8 needed = ieee80211_chandef_num_subchans(used); u8 have = ieee80211_chandef_num_subchans(ap); u8 tmp[IEEE80211_TPE_PSD_ENTRIES_320MHZ]; u8 offset; if (!psd->valid) return; /* if N is zero, all defaults were used, no point in rearranging */ if (!psd->n) goto out; BUILD_BUG_ON(sizeof(tmp) != sizeof(psd->power)); /* * This assumes that 'N' is consistent with the HE channel, as * it should be (otherwise the AP is broken). * * In psd->power we have values in the order 0..N, 0..K, where * N+K should cover the entire channel per 'ap', but even if it * doesn't then we've pre-filled 'unlimited' as defaults. * * But this is all the wrong order, we want to have them in the * order of the 'used' channel. * * So for example, we could have a 320 MHz EHT AP, which has the * HE channel as 80 MHz (e.g. due to puncturing, which doesn't * seem to be considered for the TPE), as follows: * * EHT 320: | | | | | | | | | | | | | | | | | * HE 80: | | | | | * used 160: | | | | | | | | | * * N entries: |--|--|--|--| * K entries: |--|--|--|--|--|--|--|--| |--|--|--|--| * power idx: 4 5 6 7 8 9 10 11 0 1 2 3 12 13 14 15 * full chan: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 * used chan: 0 1 2 3 4 5 6 7 * * The idx in the power array ('power idx') is like this since it * comes directly from the element's N and K entries in their * element order, and those are this way for HE compatibility. * * Rearrange them as desired here, first by putting them into the * 'full chan' order, and then selecting the necessary subset for * the 'used chan'. */ /* first reorder according to AP channel */ offset = ieee80211_calc_chandef_subchan_offset(ap, psd->n); for (int i = 0; i < have; i++) { if (i < offset) tmp[i] = psd->power[i + psd->n]; else if (i < offset + psd->n) tmp[i] = psd->power[i - offset]; else tmp[i] = psd->power[i]; } /* * and then select the subset for the used channel * (set everything to defaults first in case a driver is confused) */ memset(psd->power, IEEE80211_TPE_PSD_NO_LIMIT, sizeof(psd->power)); offset = ieee80211_calc_chandef_subchan_offset(ap, needed); for (int i = 0; i < needed; i++) psd->power[i] = tmp[offset + i]; out: /* limit, but don't lie if there are defaults in the data */ if (needed < psd->count) psd->count = needed; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_rearrange_tpe_psd); static void ieee80211_rearrange_tpe(struct ieee80211_parsed_tpe *tpe, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used) { /* ignore this completely for narrow/invalid channels */ if (!ieee80211_chandef_num_subchans(ap) || !ieee80211_chandef_num_subchans(used)) { ieee80211_clear_tpe(tpe); return; } for (int i = 0; i < 2; i++) { int needed_pwr_count; ieee80211_rearrange_tpe_psd(&tpe->psd_local[i], ap, used); ieee80211_rearrange_tpe_psd(&tpe->psd_reg_client[i], ap, used); /* limit this to the widths we actually need */ needed_pwr_count = ieee80211_chandef_num_widths(used); if (needed_pwr_count < tpe->max_local[i].count) tpe->max_local[i].count = needed_pwr_count; if (needed_pwr_count < tpe->max_reg_client[i].count) tpe->max_reg_client[i].count = needed_pwr_count; } } /* * The AP part of the channel request is used to distinguish settings * to the device used for wider bandwidth OFDMA. This is used in the * channel context code to assign two channel contexts even if they're * both for the same channel, if the AP bandwidths are incompatible. * If not EHT (or driver override) then ap.chan == NULL indicates that * there's no wider BW OFDMA used. */ static void ieee80211_set_chanreq_ap(struct ieee80211_sub_if_data *sdata, struct ieee80211_chan_req *chanreq, struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *ap_chandef) { chanreq->ap.chan = NULL; if (conn->mode < IEEE80211_CONN_MODE_EHT) return; if (sdata->vif.driver_flags & IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW) return; chanreq->ap = *ap_chandef; } VISIBLE_IF_MAC80211_KUNIT struct ieee802_11_elems * ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_bss *cbss, int link_id, struct ieee80211_chan_req *chanreq, struct cfg80211_chan_def *ap_chandef, unsigned long *userspace_selectors) { const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies); struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_channel *channel = cbss->channel; struct ieee80211_elems_parse_params parse_params = { .link_id = -1, .from_ap = true, .start = ies->data, .len = ies->len, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; enum ieee80211_conn_mode ap_mode; unsigned long unknown_rates_selectors[BITS_TO_LONGS(128)] = {}; unsigned long sta_selectors[BITS_TO_LONGS(128)] = {}; int ret; again: parse_params.mode = conn->mode; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return ERR_PTR(-ENOMEM); ap_mode = ieee80211_determine_ap_chan(sdata, channel, bss->vht_cap_info, elems, false, conn, ap_chandef); /* this should be impossible since parsing depends on our mode */ if (WARN_ON(ap_mode > conn->mode)) { ret = -EINVAL; goto free; } if (conn->mode != ap_mode) { conn->mode = ap_mode; kfree(elems); goto again; } mlme_link_id_dbg(sdata, link_id, "determined AP %pM to be %s\n", cbss->bssid, ieee80211_conn_mode_str(ap_mode)); sband = sdata->local->hw.wiphy->bands[channel->band]; ieee80211_get_rates(sband, elems->supp_rates, elems->supp_rates_len, elems->ext_supp_rates, elems->ext_supp_rates_len, NULL, NULL, unknown_rates_selectors, NULL, NULL, NULL); switch (channel->band) { case NL80211_BAND_S1GHZ: if (WARN_ON(ap_mode != IEEE80211_CONN_MODE_S1G)) { ret = -EINVAL; goto free; } return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { link_id_info(sdata, link_id, "Rejecting non-HE 6/7 GHz connection"); ret = -EINVAL; goto free; } break; default: if (WARN_ON(ap_mode == IEEE80211_CONN_MODE_S1G)) { ret = -EINVAL; goto free; } } switch (ap_mode) { case IEEE80211_CONN_MODE_S1G: WARN_ON(1); ret = -EINVAL; goto free; case IEEE80211_CONN_MODE_LEGACY: conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; break; case IEEE80211_CONN_MODE_HT: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_40); break; case IEEE80211_CONN_MODE_VHT: case IEEE80211_CONN_MODE_HE: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); break; case IEEE80211_CONN_MODE_EHT: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_320); break; } chanreq->oper = *ap_chandef; bitmap_copy(sta_selectors, userspace_selectors, 128); if (conn->mode >= IEEE80211_CONN_MODE_HT) set_bit(BSS_MEMBERSHIP_SELECTOR_HT_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_VHT) set_bit(BSS_MEMBERSHIP_SELECTOR_VHT_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_HE) set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_EHT) set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors); /* * We do not support EPD or GLK so never add them. * SAE_H2E is handled through userspace_selectors. */ /* Check if we support all required features */ if (!bitmap_subset(unknown_rates_selectors, sta_selectors, 128)) { link_id_info(sdata, link_id, "required basic rate or BSS membership selectors not supported or disabled, rejecting connection\n"); ret = -EINVAL; goto free; } ieee80211_set_chanreq_ap(sdata, chanreq, conn, ap_chandef); while (!ieee80211_chandef_usable(sdata, &chanreq->oper, IEEE80211_CHAN_DISABLED)) { if (WARN_ON(chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT)) { ret = -EINVAL; goto free; } ieee80211_chanreq_downgrade(chanreq, conn); } if (conn->mode >= IEEE80211_CONN_MODE_HE && !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_NO_HE)) { conn->mode = IEEE80211_CONN_MODE_VHT; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, IEEE80211_CHAN_NO_EHT)) { conn->mode = IEEE80211_CONN_MODE_HE; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); } if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) link_id_info(sdata, link_id, "regulatory prevented using AP config, downgraded\n"); if (conn->mode >= IEEE80211_CONN_MODE_HT && !ieee80211_verify_sta_ht_mcs_support(sdata, sband, elems->ht_operation)) { conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; link_id_info(sdata, link_id, "required MCSes not supported, disabling HT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_VHT && !ieee80211_verify_sta_vht_mcs_support(sdata, link_id, sband, elems->vht_operation)) { conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_40); link_id_info(sdata, link_id, "required MCSes not supported, disabling VHT\n"); } if (conn->mode >= IEEE80211_CONN_MODE_HE && (!ieee80211_verify_peer_he_mcs_support(sdata, link_id, (void *)elems->he_cap, elems->he_operation) || !ieee80211_verify_sta_he_mcs_support(sdata, sband, elems->he_operation))) { conn->mode = IEEE80211_CONN_MODE_VHT; link_id_info(sdata, link_id, "required MCSes not supported, disabling HE\n"); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && !ieee80211_verify_sta_eht_mcs_support(sdata, sband, elems->eht_operation)) { conn->mode = IEEE80211_CONN_MODE_HE; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); link_id_info(sdata, link_id, "required MCSes not supported, disabling EHT\n"); } /* the mode can only decrease, so this must terminate */ if (ap_mode != conn->mode) { kfree(elems); goto again; } mlme_link_id_dbg(sdata, link_id, "connecting with %s mode, max bandwidth %d MHz\n", ieee80211_conn_mode_str(conn->mode), 20 * (1 << conn->bw_limit)); if (WARN_ON_ONCE(!cfg80211_chandef_valid(&chanreq->oper))) { ret = -EINVAL; goto free; } return elems; free: kfree(elems); return ERR_PTR(ret); } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_determine_chan_mode); static int ieee80211_config_bw(struct ieee80211_link_data *link, struct ieee802_11_elems *elems, bool update, u64 *changed, const char *frame) { struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; enum ieee80211_conn_mode ap_mode; u32 vht_cap_info = 0; u16 ht_opmode; int ret; /* don't track any bandwidth changes in legacy/S1G modes */ if (link->u.mgd.conn.mode == IEEE80211_CONN_MODE_LEGACY || link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G) return 0; if (elems->vht_cap_elem) vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); ap_mode = ieee80211_determine_ap_chan(sdata, channel, vht_cap_info, elems, true, &link->u.mgd.conn, &ap_chandef); if (ap_mode != link->u.mgd.conn.mode) { link_info(link, "AP %pM appears to change mode (expected %s, found %s) in %s, disconnect\n", link->u.mgd.bssid, ieee80211_conn_mode_str(link->u.mgd.conn.mode), ieee80211_conn_mode_str(ap_mode), frame); return -EINVAL; } chanreq.oper = ap_chandef; ieee80211_set_chanreq_ap(sdata, &chanreq, &link->u.mgd.conn, &ap_chandef); /* * if HT operation mode changed store the new one - * this may be applicable even if channel is identical */ if (elems->ht_operation) { ht_opmode = le16_to_cpu(elems->ht_operation->operation_mode); if (link->conf->ht_operation_mode != ht_opmode) { *changed |= BSS_CHANGED_HT; link->conf->ht_operation_mode = ht_opmode; } } /* * Downgrade the new channel if we associated with restricted * bandwidth capabilities. For example, if we associated as a * 20 MHz STA to a 40 MHz AP (due to regulatory, capabilities * or config reasons) then switching to a 40 MHz channel now * won't do us any good -- we couldn't use it with the AP. */ while (link->u.mgd.conn.bw_limit < ieee80211_min_bw_limit_from_chandef(&chanreq.oper)) ieee80211_chandef_downgrade(&chanreq.oper, NULL); if (ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&elems->tpe, &ap_chandef, &chanreq.oper); if (memcmp(&link->conf->tpe, &elems->tpe, sizeof(elems->tpe))) { link->conf->tpe = elems->tpe; *changed |= BSS_CHANGED_TPE; } } if (ieee80211_chanreq_identical(&chanreq, &link->conf->chanreq)) return 0; link_info(link, "AP %pM changed bandwidth in %s, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", link->u.mgd.bssid, frame, chanreq.oper.chan->center_freq, chanreq.oper.chan->freq_offset, chanreq.oper.width, chanreq.oper.center_freq1, chanreq.oper.freq1_offset, chanreq.oper.center_freq2); if (!cfg80211_chandef_valid(&chanreq.oper)) { sdata_info(sdata, "AP %pM changed caps/bw in %s in a way we can't support - disconnect\n", link->u.mgd.bssid, frame); return -EINVAL; } if (!update) { link->conf->chanreq = chanreq; return 0; } /* * We're tracking the current AP here, so don't do any further checks * here. This keeps us from playing ping-pong with regulatory, without * it the following can happen (for example): * - connect to an AP with 80 MHz, world regdom allows 80 MHz * - AP advertises regdom US * - CRDA loads regdom US with 80 MHz prohibited (old database) * - we detect an unsupported channel and disconnect * - disconnect causes CRDA to reload world regdomain and the game * starts anew. * (see https://bugzilla.kernel.org/show_bug.cgi?id=70881) * * It seems possible that there are still scenarios with CSA or real * bandwidth changes where a this could happen, but those cases are * less common and wouldn't completely prevent using the AP. */ ret = ieee80211_link_change_chanreq(link, &chanreq, changed); if (ret) { sdata_info(sdata, "AP %pM changed bandwidth in %s to incompatible one - disconnect\n", link->u.mgd.bssid, frame); return ret; } cfg80211_schedule_channels_check(&sdata->wdev); return 0; } /* frame sending functions */ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ap_ht_param, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, enum ieee80211_smps_mode smps, const struct ieee80211_conn_settings *conn) { u8 *pos; u32 flags = channel->flags; u16 cap; struct ieee80211_sta_ht_cap ht_cap; BUILD_BUG_ON(sizeof(ht_cap) != sizeof(sband->ht_cap)); memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); ieee80211_apply_htcap_overrides(sdata, &ht_cap); /* determine capability flags */ cap = ht_cap.cap; switch (ap_ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) { case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: if (flags & IEEE80211_CHAN_NO_HT40PLUS) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: if (flags & IEEE80211_CHAN_NO_HT40MINUS) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } break; } /* * If 40 MHz was disabled associate as though we weren't * capable of 40 MHz -- some broken APs will never fall * back to trying to transmit in 20 MHz. */ if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_20) { cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; cap &= ~IEEE80211_HT_CAP_SGI_40; } /* set SM PS mode properly */ cap &= ~IEEE80211_HT_CAP_SM_PS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF: cap |= WLAN_HT_CAP_SM_PS_DISABLED << IEEE80211_HT_CAP_SM_PS_SHIFT; break; case IEEE80211_SMPS_STATIC: cap |= WLAN_HT_CAP_SM_PS_STATIC << IEEE80211_HT_CAP_SM_PS_SHIFT; break; case IEEE80211_SMPS_DYNAMIC: cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << IEEE80211_HT_CAP_SM_PS_SHIFT; break; } /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_ht_cap) + 2); ieee80211_ie_build_ht_cap(pos, &ht_cap, cap); } /* This function determines vht capability flags for the association * and builds the IE. * Note - the function returns true to own the MU-MIMO capability */ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_supported_band *sband, struct ieee80211_vht_cap *ap_vht_cap, const struct ieee80211_conn_settings *conn) { struct ieee80211_local *local = sdata->local; u8 *pos; u32 cap; struct ieee80211_sta_vht_cap vht_cap; u32 mask, ap_bf_sts, our_bf_sts; bool mu_mimo_owner = false; BUILD_BUG_ON(sizeof(vht_cap) != sizeof(sband->vht_cap)); memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); /* determine capability flags */ cap = vht_cap.cap; if (conn->bw_limit <= IEEE80211_CONN_BW_LIMIT_80) { cap &= ~IEEE80211_VHT_CAP_SHORT_GI_160; cap &= ~IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; } /* * Some APs apparently get confused if our capabilities are better * than theirs, so restrict what we advertise in the assoc request. */ if (!ieee80211_hw_check(&local->hw, STRICT)) { if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); else if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; } /* * If some other vif is using the MU-MIMO capability we cannot associate * using MU-MIMO - this will lead to contradictions in the group-id * mechanism. * Ownership is defined since association request, in order to avoid * simultaneous associations with MU-MIMO. */ if (cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) { bool disable_mu_mimo = false; struct ieee80211_sub_if_data *other; list_for_each_entry(other, &local->interfaces, list) { if (other->vif.bss_conf.mu_mimo_owner) { disable_mu_mimo = true; break; } } if (disable_mu_mimo) cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; else mu_mimo_owner = true; } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; ap_bf_sts = le32_to_cpu(ap_vht_cap->vht_cap_info) & mask; our_bf_sts = cap & mask; if (ap_bf_sts < our_bf_sts) { cap &= ~mask; cap |= ap_bf_sts; } /* reserve and fill IE */ pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, cap); return mu_mimo_owner; } static void ieee80211_assoc_add_rates(struct ieee80211_local *local, struct sk_buff *skb, enum nl80211_chan_width width, struct ieee80211_supported_band *sband, struct ieee80211_mgd_assoc_data *assoc_data) { u32 rates; if (assoc_data->supp_rates_len && !ieee80211_hw_check(&local->hw, STRICT)) { /* * Get all rates supported by the device and the AP as * some APs don't like getting a superset of their rates * in the association request (e.g. D-Link DAP 1353 in * b-only mode)... */ ieee80211_parse_bitrates(width, sband, assoc_data->supp_rates, assoc_data->supp_rates_len, &rates); } else { /* * In case AP not provide any supported rates information * before association, we send information element(s) with * all rates that we support. */ rates = ~0; } ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_SUPP_RATES); ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_EXT_SUPP_RATES); } static size_t ieee80211_add_before_ht_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { size_t noffset; static const u8 before_ht[] = { WLAN_EID_SSID, WLAN_EID_SUPP_RATES, WLAN_EID_EXT_SUPP_RATES, WLAN_EID_PWR_CAPABILITY, WLAN_EID_SUPPORTED_CHANNELS, WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_FAST_BSS_TRANSITION, /* reassoc only */ WLAN_EID_RIC_DATA, /* reassoc only */ WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; static const u8 after_ric[] = { WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_HT_CAPABILITY, WLAN_EID_BSS_COEX_2040, /* luckily this is almost always there */ WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ WLAN_EID_VHT_CAPABILITY, WLAN_EID_OPMODE_NOTIF, }; if (!elems_len) return offset; noffset = ieee80211_ie_split_ric(elems, elems_len, before_ht, ARRAY_SIZE(before_ht), after_ric, ARRAY_SIZE(after_ric), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_vht_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_vht[] = { /* * no need to list the ones split off before HT * or generated here */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; size_t noffset; if (!elems_len) return offset; /* RIC already taken care of in ieee80211_add_before_ht_elems() */ noffset = ieee80211_ie_split(elems, elems_len, before_vht, ARRAY_SIZE(before_vht), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } static size_t ieee80211_add_before_he_elems(struct sk_buff *skb, const u8 *elems, size_t elems_len, size_t offset) { static const u8 before_he[] = { /* * no need to list the ones split off before VHT * or generated here */ WLAN_EID_OPMODE_NOTIF, WLAN_EID_EXTENSION, WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE, /* 11ai elements */ WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_SESSION, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_PUBLIC_KEY, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_KEY_CONFIRM, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_HLP_CONTAINER, WLAN_EID_EXTENSION, WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN, /* TODO: add 11ah/11aj/11ak elements */ }; size_t noffset; if (!elems_len) return offset; /* RIC already taken care of in ieee80211_add_before_ht_elems() */ noffset = ieee80211_ie_split(elems, elems_len, before_he, ARRAY_SIZE(before_he), offset); skb_put_data(skb, elems + offset, noffset - offset); return noffset; } #define PRESENT_ELEMS_MAX 8 #define PRESENT_ELEM_EXT_OFFS 0x100 static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 capab, const struct element *ext_capa, const u16 *present_elems, struct ieee80211_mgd_assoc_data *assoc_data); static size_t ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 *capab, const struct element *ext_capa, const u8 *extra_elems, size_t extra_elems_len, unsigned int link_id, struct ieee80211_link_data *link, u16 *present_elems, struct ieee80211_mgd_assoc_data *assoc_data) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_channel *chan = cbss->channel; const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20; struct ieee80211_chanctx_conf *chanctx_conf; enum ieee80211_smps_mode smps_mode; u16 orig_capab = *capab; size_t offset = 0; int present_elems_len = 0; u8 *pos; int i; #define ADD_PRESENT_ELEM(id) do { \ /* need a last for termination - we use 0 == SSID */ \ if (!WARN_ON(present_elems_len >= PRESENT_ELEMS_MAX - 1)) \ present_elems[present_elems_len++] = (id); \ } while (0) #define ADD_PRESENT_EXT_ELEM(id) ADD_PRESENT_ELEM(PRESENT_ELEM_EXT_OFFS | (id)) if (link) smps_mode = link->smps_mode; else if (sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_DYNAMIC; else smps_mode = IEEE80211_SMPS_OFF; if (link) { /* * 5/10 MHz scenarios are only viable without MLO, in which * case this pointer should be used ... All of this is a bit * unclear though, not sure this even works at all. */ rcu_read_lock(); chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (chanctx_conf) width = chanctx_conf->def.width; rcu_read_unlock(); } sband = local->hw.wiphy->bands[chan->band]; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (sband->band == NL80211_BAND_2GHZ) { *capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; *capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } if ((cbss->capability & WLAN_CAPABILITY_SPECTRUM_MGMT) && ieee80211_hw_check(&local->hw, SPECTRUM_MGMT)) *capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (sband->band != NL80211_BAND_S1GHZ) ieee80211_assoc_add_rates(local, skb, width, sband, assoc_data); if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT || *capab & WLAN_CAPABILITY_RADIO_MEASURE) { struct cfg80211_chan_def chandef = { .width = width, .chan = chan, }; pos = skb_put(skb, 4); *pos++ = WLAN_EID_PWR_CAPABILITY; *pos++ = 2; *pos++ = 0; /* min tx power */ /* max tx power */ *pos++ = ieee80211_chandef_max_power(&chandef); ADD_PRESENT_ELEM(WLAN_EID_PWR_CAPABILITY); } /* * Per spec, we shouldn't include the list of channels if we advertise * support for extended channel switching, but we've always done that; * (for now?) apply this restriction only on the (new) 6 GHz band. */ if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT && (sband->band != NL80211_BAND_6GHZ || !ext_capa || ext_capa->datalen < 1 || !(ext_capa->data[0] & WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING))) { /* TODO: get this in reg domain format */ pos = skb_put(skb, 2 * sband->n_channels + 2); *pos++ = WLAN_EID_SUPPORTED_CHANNELS; *pos++ = 2 * sband->n_channels; for (i = 0; i < sband->n_channels; i++) { int cf = sband->channels[i].center_freq; *pos++ = ieee80211_frequency_to_channel(cf); *pos++ = 1; /* one channel in the subband*/ } ADD_PRESENT_ELEM(WLAN_EID_SUPPORTED_CHANNELS); } /* if present, add any custom IEs that go before HT */ offset = ieee80211_add_before_ht_elems(skb, extra_elems, extra_elems_len, offset); if (sband->band != NL80211_BAND_6GHZ && assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HT) { ieee80211_add_ht_ie(sdata, skb, assoc_data->link[link_id].ap_ht_param, sband, chan, smps_mode, &assoc_data->link[link_id].conn); ADD_PRESENT_ELEM(WLAN_EID_HT_CAPABILITY); } /* if present, add any custom IEs that go before VHT */ offset = ieee80211_add_before_vht_elems(skb, extra_elems, extra_elems_len, offset); if (sband->band != NL80211_BAND_6GHZ && assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_VHT && sband->vht_cap.vht_supported) { bool mu_mimo_owner = ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->link[link_id].ap_vht_cap, &assoc_data->link[link_id].conn); if (link) link->conf->mu_mimo_owner = mu_mimo_owner; ADD_PRESENT_ELEM(WLAN_EID_VHT_CAPABILITY); } /* if present, add any custom IEs that go before HE */ offset = ieee80211_add_before_he_elems(skb, extra_elems, extra_elems_len, offset); if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_put_he_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* * careful - need to know about all the present elems before * calling ieee80211_assoc_add_ml_elem(), so add this one if * we're going to put it after the ML element */ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY); if (link_id == assoc_data->assoc_link_id) ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa, present_elems, assoc_data); /* crash if somebody gets it wrong */ present_elems = NULL; if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ieee80211_put_eht_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_add_aid_request_ie(sdata, skb); ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); } if (iftd && iftd->vendor_elems.data && iftd->vendor_elems.len) skb_put_data(skb, iftd->vendor_elems.data, iftd->vendor_elems.len); return offset; } static void ieee80211_add_non_inheritance_elem(struct sk_buff *skb, const u16 *outer, const u16 *inner) { unsigned int skb_len = skb->len; bool at_extension = false; bool added = false; int i, j; u8 *len, *list_len = NULL; skb_put_u8(skb, WLAN_EID_EXTENSION); len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_NON_INHERITANCE); for (i = 0; i < PRESENT_ELEMS_MAX && outer[i]; i++) { u16 elem = outer[i]; bool have_inner = false; /* should at least be sorted in the sense of normal -> ext */ WARN_ON(at_extension && elem < PRESENT_ELEM_EXT_OFFS); /* switch to extension list */ if (!at_extension && elem >= PRESENT_ELEM_EXT_OFFS) { at_extension = true; if (!list_len) skb_put_u8(skb, 0); list_len = NULL; } for (j = 0; j < PRESENT_ELEMS_MAX && inner[j]; j++) { if (elem == inner[j]) { have_inner = true; break; } } if (have_inner) continue; if (!list_len) { list_len = skb_put(skb, 1); *list_len = 0; } *list_len += 1; skb_put_u8(skb, (u8)elem); added = true; } /* if we added a list but no extension list, make a zero-len one */ if (added && (!at_extension || !list_len)) skb_put_u8(skb, 0); /* if nothing added remove extension element completely */ if (!added) skb_trim(skb, skb_len); else *len = skb->len - skb_len - 2; } static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u16 capab, const struct element *ext_capa, const u16 *outer_present_elems, struct ieee80211_mgd_assoc_data *assoc_data) { struct ieee80211_local *local = sdata->local; struct ieee80211_multi_link_elem *ml_elem; struct ieee80211_mle_basic_common_info *common; const struct wiphy_iftype_ext_capab *ift_ext_capa; __le16 eml_capa = 0, mld_capa_ops = 0; unsigned int link_id; u8 *ml_elem_len; void *capab_pos; if (!ieee80211_vif_is_mld(&sdata->vif)) return; ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, ieee80211_vif_type_p2p(&sdata->vif)); if (ift_ext_capa) { eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities); mld_capa_ops = cpu_to_le16(ift_ext_capa->mld_capa_and_ops); } skb_put_u8(skb, WLAN_EID_EXTENSION); ml_elem_len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK); ml_elem = skb_put(skb, sizeof(*ml_elem)); ml_elem->control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC | IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP); common = skb_put(skb, sizeof(*common)); common->len = sizeof(*common) + 2; /* MLD capa/ops */ memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN); /* add EML_CAPA only if needed, see Draft P802.11be_D2.1, 35.3.17 */ if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) { common->len += 2; /* EML capabilities */ ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EML_CAPA); skb_put_data(skb, &eml_capa, sizeof(eml_capa)); } skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); /* Many APs have broken parsing of the extended MLD capa/ops field, * dropping (re-)association request frames or replying with association * response with a failure status if it's present. Without a clear * indication as to whether the AP supports parsing this field or not do * not include it in the common information unless strict mode is set. */ if (ieee80211_hw_check(&local->hw, STRICT) && assoc_data->ext_mld_capa_ops) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP); common->len += 2; skb_put_data(skb, &assoc_data->ext_mld_capa_ops, sizeof(assoc_data->ext_mld_capa_ops)); } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; const u8 *extra_elems; size_t extra_elems_len; size_t extra_used; u8 *subelem_len = NULL; __le16 ctrl; if (!assoc_data->link[link_id].bss || link_id == assoc_data->assoc_link_id) continue; extra_elems = assoc_data->link[link_id].elems; extra_elems_len = assoc_data->link[link_id].elems_len; skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); subelem_len = skb_put(skb, 1); ctrl = cpu_to_le16(link_id | IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE | IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT); skb_put_data(skb, &ctrl, sizeof(ctrl)); skb_put_u8(skb, 1 + ETH_ALEN); /* STA Info Length */ skb_put_data(skb, assoc_data->link[link_id].addr, ETH_ALEN); /* * Now add the contents of the (re)association request, * but the "listen interval" and "current AP address" * (if applicable) are skipped. So we only have * the capability field (remember the position and fill * later), followed by the elements added below by * calling ieee80211_add_link_elems(). */ capab_pos = skb_put(skb, 2); extra_used = ieee80211_add_link_elems(sdata, skb, &capab, ext_capa, extra_elems, extra_elems_len, link_id, NULL, link_present_elems, assoc_data); if (extra_elems) skb_put_data(skb, extra_elems + extra_used, extra_elems_len - extra_used); put_unaligned_le16(capab, capab_pos); ieee80211_add_non_inheritance_elem(skb, outer_present_elems, link_present_elems); ieee80211_fragment_element(skb, subelem_len, IEEE80211_MLE_SUBELEM_FRAGMENT); } ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); } static int ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype iftype, struct cfg80211_bss *cbss, size_t elems_len) { struct ieee80211_local *local = sdata->local; const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; size_t size = 0; if (!cbss) return size; sband = local->hw.wiphy->bands[cbss->channel->band]; /* add STA profile elements length */ size += elems_len; /* and supported rates length */ size += 4 + sband->n_bitrates; /* supported channels */ size += 2 + 2 * sband->n_channels; iftd = ieee80211_get_sband_iftype_data(sband, iftype); if (iftd) size += iftd->vendor_elems.len; /* power capability */ size += 4; /* HT, VHT, HE, EHT */ size += 2 + sizeof(struct ieee80211_ht_cap); size += 2 + sizeof(struct ieee80211_vht_cap); size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN; if (sband->band == NL80211_BAND_6GHZ) size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + sizeof(struct ieee80211_eht_mcs_nss_supp) + IEEE80211_EHT_PPE_THRES_MAX_LEN; return size; } static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_link_data *link; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, qos_info, *ie_start; size_t offset, noffset; u16 capab = 0, link_capab; __le16 listen_int; struct element *ext_capa = NULL; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct ieee80211_prep_tx_info info = {}; unsigned int link_id, n_links = 0; u16 present_elems[PRESENT_ELEMS_MAX] = {}; void *capab_pos; size_t size; int ret; /* we know it's writable, cast away the const */ if (assoc_data->ie_len) ext_capa = (void *)cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, assoc_data->ie, assoc_data->ie_len); lockdep_assert_wiphy(sdata->local->hw.wiphy); size = local->hw.extra_tx_headroom + sizeof(*mgmt) + /* bit too much but doesn't matter */ 2 + assoc_data->ssid_len + /* SSID */ assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9; /* WMM */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; size_t elems_len = assoc_data->link[link_id].elems_len; if (!cbss) continue; n_links++; size += ieee80211_link_common_elems_size(sdata, iftype, cbss, elems_len); /* non-inheritance element */ size += 2 + 2 + PRESENT_ELEMS_MAX; /* should be the same across all BSSes */ if (cbss->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; } if (ieee80211_vif_is_mld(&sdata->vif)) { /* consider the multi-link element with STA profile */ size += sizeof(struct ieee80211_multi_link_elem); /* max common info field in basic multi-link element */ size += sizeof(struct ieee80211_mle_basic_common_info) + 2 + /* capa & op */ 2 + /* ext capa & op */ 2; /* EML capa */ /* * The capability elements were already considered above; * note this over-estimates a bit because there's no * STA profile for the assoc link. */ size += (n_links - 1) * (1 + 1 + /* subelement ID/length */ 2 + /* STA control */ 1 + ETH_ALEN + 2 /* STA Info field */); } link = sdata_dereference(sdata->link[assoc_data->assoc_link_id], sdata); if (WARN_ON(!link)) return -EINVAL; if (WARN_ON(!assoc_data->link[assoc_data->assoc_link_id].bss)) return -EINVAL; skb = alloc_skb(size, GFP_KERNEL); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); if (ifmgd->flags & IEEE80211_STA_ENABLE_RRM) capab |= WLAN_CAPABILITY_RADIO_MEASURE; /* Set MBSSID support for HE AP if needed */ if (ieee80211_hw_check(&local->hw, SUPPORTS_ONLY_HE_MULTI_BSSID) && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && ext_capa && ext_capa->datalen >= 3) ext_capa->data[2] |= WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT; mgmt = skb_put_zero(skb, 24); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); listen_int = cpu_to_le16(assoc_data->s1g ? ieee80211_encode_usf(local->hw.conf.listen_interval) : local->hw.conf.listen_interval); if (!is_zero_ether_addr(assoc_data->prev_ap_addr)) { skb_put(skb, 10); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ); capab_pos = &mgmt->u.reassoc_req.capab_info; mgmt->u.reassoc_req.listen_interval = listen_int; memcpy(mgmt->u.reassoc_req.current_ap, assoc_data->prev_ap_addr, ETH_ALEN); info.subtype = IEEE80211_STYPE_REASSOC_REQ; } else { skb_put(skb, 4); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ); capab_pos = &mgmt->u.assoc_req.capab_info; mgmt->u.assoc_req.listen_interval = listen_int; info.subtype = IEEE80211_STYPE_ASSOC_REQ; } /* SSID */ pos = skb_put(skb, 2 + assoc_data->ssid_len); ie_start = pos; *pos++ = WLAN_EID_SSID; *pos++ = assoc_data->ssid_len; memcpy(pos, assoc_data->ssid, assoc_data->ssid_len); /* * This bit is technically reserved, so it shouldn't matter for either * the AP or us, but it also means we shouldn't set it. However, we've * always set it in the past, and apparently some EHT APs check that * we don't set it. To avoid interoperability issues with old APs that * for some reason check it and want it to be set, set the bit for all * pre-EHT connections as we used to do. */ if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT && !ieee80211_hw_check(&local->hw, STRICT)) capab |= WLAN_CAPABILITY_ESS; /* add the elements for the assoc (main) link */ link_capab = capab; offset = ieee80211_add_link_elems(sdata, skb, &link_capab, ext_capa, assoc_data->ie, assoc_data->ie_len, assoc_data->assoc_link_id, link, present_elems, assoc_data); put_unaligned_le16(link_capab, capab_pos); /* if present, add any custom non-vendor IEs */ if (assoc_data->ie_len) { noffset = ieee80211_ie_split_vendor(assoc_data->ie, assoc_data->ie_len, offset); skb_put_data(skb, assoc_data->ie + offset, noffset - offset); offset = noffset; } if (assoc_data->wmm) { if (assoc_data->uapsd) { qos_info = ifmgd->uapsd_queues; qos_info |= (ifmgd->uapsd_max_sp_len << IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); } else { qos_info = 0; } pos = ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); } /* add any remaining custom (i.e. vendor specific here) IEs */ if (assoc_data->ie_len) { noffset = assoc_data->ie_len; skb_put_data(skb, assoc_data->ie + offset, noffset - offset); } if (assoc_data->fils_kek_len) { ret = fils_encrypt_assoc_req(skb, assoc_data); if (ret < 0) { dev_kfree_skb(skb); return ret; } } pos = skb_tail_pointer(skb); kfree(ifmgd->assoc_req_ies); ifmgd->assoc_req_ies = kmemdup(ie_start, pos - ie_start, GFP_ATOMIC); if (!ifmgd->assoc_req_ies) { dev_kfree_skb(skb); return -ENOMEM; } ifmgd->assoc_req_ies_len = pos - ie_start; info.link_id = assoc_data->assoc_link_id; drv_mgd_prepare_tx(local, sdata, &info); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_tx_skb(sdata, skb); return 0; } void ieee80211_send_pspoll(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_pspoll *pspoll; struct sk_buff *skb; skb = ieee80211_pspoll_get(&local->hw, &sdata->vif); if (!skb) return; pspoll = (struct ieee80211_pspoll *) skb->data; pspoll->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, bool powersave) { struct sk_buff *skb; struct ieee80211_hdr_3addr *nullfunc; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, -1, !ieee80211_hw_check(&local->hw, DOESNT_SUPPORT_QOS_NDP)); if (!skb) return; nullfunc = (struct ieee80211_hdr_3addr *) skb->data; if (powersave) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_PM); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; ieee80211_tx_skb(sdata, skb); } void ieee80211_send_4addr_nullfunc(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct sk_buff *skb; struct ieee80211_hdr *nullfunc; __le16 fc; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; skb = dev_alloc_skb(local->hw.extra_tx_headroom + 30); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put_zero(skb, 30); fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); nullfunc->frame_control = fc; memcpy(nullfunc->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->deflink.u.mgd.bssid, ETH_ALEN); memcpy(nullfunc->addr4, sdata->vif.addr, ETH_ALEN); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_USE_MINRATE; ieee80211_tx_skb(sdata, skb); } /* spectrum management related things */ static void ieee80211_csa_switch_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.csa.switch_work.work); struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ret; if (!ieee80211_sdata_running(sdata)) return; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; if (!link->conf->csa_active) return; /* * If the link isn't active (now), we cannot wait for beacons, won't * have a reserved chanctx, etc. Just switch over the chandef and * update cfg80211 directly. */ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) { link->conf->chanreq = link->csa.chanreq; cfg80211_ch_switch_notify(sdata->dev, &link->csa.chanreq.oper, link->link_id); return; } /* * using reservation isn't immediate as it may be deferred until later * with multi-vif. once reservation is complete it will re-schedule the * work with no reserved_chanctx so verify chandef to check if it * completed successfully */ if (link->reserved_chanctx) { /* * with multi-vif csa driver may call ieee80211_csa_finish() * many times while waiting for other interfaces to use their * reservations */ if (link->reserved_ready) return; ret = ieee80211_link_use_reserved_context(link); if (ret) { link_info(link, "failed to use reserved channel context, disconnecting (err=%d)\n", ret); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } return; } if (!ieee80211_chanreq_identical(&link->conf->chanreq, &link->csa.chanreq)) { link_info(link, "failed to finalize channel switch, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } link->u.mgd.csa.waiting_bcn = true; /* apply new TPE restrictions immediately on the new channel */ if (link->u.mgd.csa.ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&link->u.mgd.csa.tpe, &link->u.mgd.csa.ap_chandef, &link->conf->chanreq.oper); if (memcmp(&link->conf->tpe, &link->u.mgd.csa.tpe, sizeof(link->u.mgd.csa.tpe))) { link->conf->tpe = link->u.mgd.csa.tpe; ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_TPE); } } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } static void ieee80211_chswitch_post_beacon(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!link->conf->csa_active); ieee80211_vif_unblock_queues_csa(sdata); link->conf->csa_active = false; link->u.mgd.csa.blocked_tx = false; link->u.mgd.csa.waiting_bcn = false; ret = drv_post_channel_switch(link); if (ret) { link_info(link, "driver post channel switch failed, disconnecting\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } cfg80211_ch_switch_notify(sdata->dev, &link->conf->chanreq.oper, link->link_id); } void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_chswitch_done(sdata, success, link_id); rcu_read_lock(); if (!success) { sdata_info(sdata, "driver channel switch failed (link %d), disconnecting\n", link_id); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.csa_connection_drop_work); } else { struct ieee80211_link_data *link = rcu_dereference(sdata->link[link_id]); if (WARN_ON(!link)) { rcu_read_unlock(); return; } wiphy_delayed_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); } rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_chswitch_done); static void ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->abort_channel_switch) return; ieee80211_link_unreserve_chanctx(link); ieee80211_vif_unblock_queues_csa(sdata); link->conf->csa_active = false; link->u.mgd.csa.blocked_tx = false; drv_abort_channel_switch(link); } struct sta_csa_rnr_iter_data { struct ieee80211_link_data *link; struct ieee80211_channel *chan; u8 mld_id; }; static enum cfg80211_rnr_iter_ret ieee80211_sta_csa_rnr_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct sta_csa_rnr_iter_data *data = _data; struct ieee80211_link_data *link = data->link; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const struct ieee80211_tbtt_info_ge_11 *ti; enum nl80211_band band; unsigned int center_freq; int link_id; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (tbtt_info_len < sizeof(*ti)) return RNR_ITER_CONTINUE; ti = (const void *)tbtt_info; if (ti->mld_params.mld_id != data->mld_id) return RNR_ITER_CONTINUE; link_id = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); if (link_id != data->link->link_id) return RNR_ITER_CONTINUE; /* we found the entry for our link! */ /* this AP is confused, it had this right before ... just disconnect */ if (!ieee80211_operating_class_to_band(info->op_class, &band)) { link_info(link, "AP now has invalid operating class in RNR, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return RNR_ITER_BREAK; } center_freq = ieee80211_channel_to_frequency(info->channel, band); data->chan = ieee80211_get_channel(sdata->local->hw.wiphy, center_freq); return RNR_ITER_BREAK; } static void ieee80211_sta_other_link_csa_disappeared(struct ieee80211_link_data *link, struct ieee802_11_elems *elems) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct sta_csa_rnr_iter_data data = { .link = link, }; /* * If we get here, we see a beacon from another link without * CSA still being reported for it, so now we have to check * if the CSA was aborted or completed. This may not even be * perfectly possible if the CSA was only done for changing * the puncturing, but in that case if the link in inactive * we don't really care, and if it's an active link (or when * it's activated later) we'll get a beacon and adjust. */ if (WARN_ON(!elems->ml_basic)) return; data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); /* * So in order to do this, iterate the RNR element(s) and see * what channel is reported now. */ cfg80211_iter_rnr(elems->ie_start, elems->total_len, ieee80211_sta_csa_rnr_iter, &data); if (!data.chan) { link_info(link, "couldn't find (valid) channel in RNR for CSA, disconnect\n"); wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); return; } /* * If it doesn't match the CSA, then assume it aborted. This * may erroneously detect that it was _not_ aborted when it * was in fact aborted, but only changed the bandwidth or the * puncturing configuration, but we don't have enough data to * detect that. */ if (data.chan != link->csa.chanreq.oper.chan) ieee80211_sta_abort_chanswitch(link); } enum ieee80211_csa_source { IEEE80211_CSA_SOURCE_BEACON, IEEE80211_CSA_SOURCE_OTHER_LINK, IEEE80211_CSA_SOURCE_PROT_ACTION, IEEE80211_CSA_SOURCE_UNPROT_ACTION, }; static void ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, u64 timestamp, u32 device_timestamp, struct ieee802_11_elems *full_elems, struct ieee802_11_elems *csa_elems, enum ieee80211_csa_source source) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_chanctx *chanctx = NULL; struct ieee80211_chanctx_conf *conf; struct ieee80211_csa_ie csa_ie = {}; struct ieee80211_channel_switch ch_switch = { .link_id = link->link_id, .timestamp = timestamp, .device_timestamp = device_timestamp, }; unsigned long now; int res; lockdep_assert_wiphy(local->hw.wiphy); if (csa_elems) { struct cfg80211_bss *cbss = link->conf->bss; enum nl80211_band current_band; struct ieee80211_bss *bss; if (WARN_ON(!cbss)) return; current_band = cbss->channel->band; bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, csa_elems, current_band, bss->vht_cap_info, &link->u.mgd.conn, link->u.mgd.bssid, source == IEEE80211_CSA_SOURCE_UNPROT_ACTION, &csa_ie); if (res == 0) { ch_switch.block_tx = csa_ie.mode; ch_switch.chandef = csa_ie.chanreq.oper; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; } link->u.mgd.csa.tpe = csa_elems->csa_tpe; } else { /* * If there was no per-STA profile for this link, we * get called with csa_elems == NULL. This of course means * there are no CSA elements, so set res=1 indicating * no more CSA. */ res = 1; } if (res < 0) { /* ignore this case, not a protected frame */ if (source == IEEE80211_CSA_SOURCE_UNPROT_ACTION) return; goto drop_connection; } if (link->conf->csa_active) { switch (source) { case IEEE80211_CSA_SOURCE_PROT_ACTION: case IEEE80211_CSA_SOURCE_UNPROT_ACTION: /* already processing - disregard action frames */ return; case IEEE80211_CSA_SOURCE_BEACON: if (link->u.mgd.csa.waiting_bcn) { ieee80211_chswitch_post_beacon(link); /* * If the CSA is still present after the switch * we need to consider it as a new CSA (possibly * to self). This happens by not returning here * so we'll get to the check below. */ } else if (res) { ieee80211_sta_abort_chanswitch(link); return; } else { drv_channel_switch_rx_beacon(sdata, &ch_switch); return; } break; case IEEE80211_CSA_SOURCE_OTHER_LINK: /* active link: we want to see the beacon to continue */ if (ieee80211_vif_link_active(&sdata->vif, link->link_id)) return; /* switch work ran, so just complete the process */ if (link->u.mgd.csa.waiting_bcn) { ieee80211_chswitch_post_beacon(link); /* * If the CSA is still present after the switch * we need to consider it as a new CSA (possibly * to self). This happens by not returning here * so we'll get to the check below. */ break; } /* link still has CSA but we already know, do nothing */ if (!res) return; /* check in the RNR if the CSA aborted */ ieee80211_sta_other_link_csa_disappeared(link, full_elems); return; } } /* no active CSA nor a new one */ if (res) { /* * However, we may have stopped queues when receiving a public * action frame that couldn't be protected, if it had the quiet * bit set. This is a trade-off, we want to be quiet as soon as * possible, but also don't trust the public action frame much, * as it can't be protected. */ if (unlikely(link->u.mgd.csa.blocked_tx)) { link->u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); } return; } /* * We don't really trust public action frames, but block queues (go to * quiet mode) for them anyway, we should get a beacon soon to either * know what the CSA really is, or figure out the public action frame * was actually an attack. */ if (source == IEEE80211_CSA_SOURCE_UNPROT_ACTION) { if (csa_ie.mode) { link->u.mgd.csa.blocked_tx = true; ieee80211_vif_block_queues_csa(sdata); } return; } if (link->conf->chanreq.oper.chan->band != csa_ie.chanreq.oper.chan->band) { link_info(link, "AP %pM switches to different band (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", link->u.mgd.bssid, csa_ie.chanreq.oper.chan->center_freq, csa_ie.chanreq.oper.width, csa_ie.chanreq.oper.center_freq1, csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chanreq.oper, IEEE80211_CHAN_DISABLED)) { link_info(link, "AP %pM switches to unsupported channel (%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), disconnecting\n", link->u.mgd.bssid, csa_ie.chanreq.oper.chan->center_freq, csa_ie.chanreq.oper.chan->freq_offset, csa_ie.chanreq.oper.width, csa_ie.chanreq.oper.center_freq1, csa_ie.chanreq.oper.freq1_offset, csa_ie.chanreq.oper.center_freq2); goto drop_connection; } if (cfg80211_chandef_identical(&csa_ie.chanreq.oper, &link->conf->chanreq.oper) && (!csa_ie.mode || source != IEEE80211_CSA_SOURCE_BEACON)) { if (link->u.mgd.csa.ignored_same_chan) return; link_info(link, "AP %pM tries to chanswitch to same channel, ignore\n", link->u.mgd.bssid); link->u.mgd.csa.ignored_same_chan = true; return; } /* * Drop all TDLS peers on the affected link - either we disconnect or * move to a different channel from this point on. There's no telling * what our peer will do. * The TDLS WIDER_BW scenario is also problematic, as peers might now * have an incompatible wider chandef. */ ieee80211_teardown_tdls_peers(link); conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && !conf) { link_info(link, "no channel context assigned to vif?, disconnecting\n"); goto drop_connection; } if (conf) chanctx = container_of(conf, struct ieee80211_chanctx, conf); if (!ieee80211_hw_check(&local->hw, CHANCTX_STA_CSA)) { link_info(link, "driver doesn't support chan-switch with channel contexts\n"); goto drop_connection; } if (drv_pre_channel_switch(sdata, &ch_switch)) { link_info(link, "preparing for channel switch failed, disconnecting\n"); goto drop_connection; } link->u.mgd.csa.ap_chandef = csa_ie.chanreq.ap; link->csa.chanreq.oper = csa_ie.chanreq.oper; ieee80211_set_chanreq_ap(sdata, &link->csa.chanreq, &link->u.mgd.conn, &csa_ie.chanreq.ap); if (chanctx) { res = ieee80211_link_reserve_chanctx(link, &link->csa.chanreq, chanctx->mode, false); if (res) { link_info(link, "failed to reserve channel context for channel switch, disconnecting (err=%d)\n", res); goto drop_connection; } } link->conf->csa_active = true; link->u.mgd.csa.ignored_same_chan = false; link->u.mgd.beacon_crc_valid = false; link->u.mgd.csa.blocked_tx = csa_ie.mode; if (csa_ie.mode) ieee80211_vif_block_queues_csa(sdata); cfg80211_ch_switch_started_notify(sdata->dev, &csa_ie.chanreq.oper, link->link_id, csa_ie.count, csa_ie.mode); /* we may have to handle timeout for deactivated link in software */ now = jiffies; link->u.mgd.csa.time = now + TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * link->conf->beacon_int); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && local->ops->channel_switch) { /* * Use driver's channel switch callback, the driver will * later call ieee80211_chswitch_done(). It may deactivate * the link as well, we handle that elsewhere and queue * the csa.switch_work for the calculated time then. */ drv_channel_switch(local, sdata, &ch_switch); return; } /* channel switch handled in software */ wiphy_delayed_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - now); return; drop_connection: /* * This is just so that the disconnect flow will know that * we were trying to switch channel and failed. In case the * mode is 1 (we are not allowed to Tx), we will know not to * send a deauthentication frame. Those two fields will be * reset when the disconnection worker runs. */ link->conf->csa_active = true; link->u.mgd.csa.blocked_tx = csa_ie.mode; wiphy_work_queue(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); } struct sta_bss_param_ch_cnt_data { struct ieee80211_sub_if_data *sdata; u8 reporting_link_id; u8 mld_id; }; static enum cfg80211_rnr_iter_ret ieee80211_sta_bss_param_ch_cnt_iter(void *_data, u8 type, const struct ieee80211_neighbor_ap_info *info, const u8 *tbtt_info, u8 tbtt_info_len) { struct sta_bss_param_ch_cnt_data *data = _data; struct ieee80211_sub_if_data *sdata = data->sdata; const struct ieee80211_tbtt_info_ge_11 *ti; u8 bss_param_ch_cnt; int link_id; if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) return RNR_ITER_CONTINUE; if (tbtt_info_len < sizeof(*ti)) return RNR_ITER_CONTINUE; ti = (const void *)tbtt_info; if (ti->mld_params.mld_id != data->mld_id) return RNR_ITER_CONTINUE; link_id = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_LINK_ID); bss_param_ch_cnt = le16_get_bits(ti->mld_params.params, IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); if (bss_param_ch_cnt != 255 && link_id < ARRAY_SIZE(sdata->link)) { struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); if (link && link->conf->bss_param_ch_cnt != bss_param_ch_cnt) { link->conf->bss_param_ch_cnt = bss_param_ch_cnt; link->conf->bss_param_ch_cnt_link_id = data->reporting_link_id; } } return RNR_ITER_CONTINUE; } static void ieee80211_mgd_update_bss_param_ch_cnt(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf, struct ieee802_11_elems *elems) { struct sta_bss_param_ch_cnt_data data = { .reporting_link_id = bss_conf->link_id, .sdata = sdata, }; int bss_param_ch_cnt; if (!elems->ml_basic) return; data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); cfg80211_iter_rnr(elems->ie_start, elems->total_len, ieee80211_sta_bss_param_ch_cnt_iter, &data); bss_param_ch_cnt = ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); /* * Update bss_param_ch_cnt_link_id even if bss_param_ch_cnt * didn't change to indicate that we got a beacon on our own * link. */ if (bss_param_ch_cnt >= 0 && bss_param_ch_cnt != 255) { bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = bss_conf->link_id; } } static bool ieee80211_find_80211h_pwr_constr(struct ieee80211_channel *channel, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_elem, int *chan_pwr, int *pwr_reduction) { struct ieee80211_country_ie_triplet *triplet; int chan = ieee80211_frequency_to_channel(channel->center_freq); int i, chan_increment; bool have_chan_pwr = false; /* Invalid IE */ if (country_ie_len % 2 || country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) return false; triplet = (void *)(country_ie + 3); country_ie_len -= 3; switch (channel->band) { default: WARN_ON_ONCE(1); fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: case NL80211_BAND_LC: chan_increment = 1; break; case NL80211_BAND_5GHZ: chan_increment = 4; break; case NL80211_BAND_6GHZ: /* * In the 6 GHz band, the "maximum transmit power level" * field in the triplets is reserved, and thus will be * zero and we shouldn't use it to control TX power. * The actual TX power will be given in the transmit * power envelope element instead. */ return false; } /* find channel */ while (country_ie_len >= 3) { u8 first_channel = triplet->chans.first_channel; if (first_channel >= IEEE80211_COUNTRY_EXTENSION_ID) goto next; for (i = 0; i < triplet->chans.num_channels; i++) { if (first_channel + i * chan_increment == chan) { have_chan_pwr = true; *chan_pwr = triplet->chans.max_power; break; } } if (have_chan_pwr) break; next: triplet++; country_ie_len -= 3; } if (have_chan_pwr && pwr_constr_elem) *pwr_reduction = *pwr_constr_elem; else *pwr_reduction = 0; return have_chan_pwr; } static void ieee80211_find_cisco_dtpc(struct ieee80211_channel *channel, const u8 *cisco_dtpc_ie, int *pwr_level) { /* From practical testing, the first data byte of the DTPC element * seems to contain the requested dBm level, and the CLI on Cisco * APs clearly state the range is -127 to 127 dBm, which indicates * a signed byte, although it seemingly never actually goes negative. * The other byte seems to always be zero. */ *pwr_level = (__s8)cisco_dtpc_ie[4]; } static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, struct ieee80211_channel *channel, struct ieee80211_mgmt *mgmt, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_ie, const u8 *cisco_dtpc_ie) { struct ieee80211_sub_if_data *sdata = link->sdata; bool has_80211h_pwr = false, has_cisco_pwr = false; int chan_pwr = 0, pwr_reduction_80211h = 0; int pwr_level_cisco, pwr_level_80211h; int new_ap_level; __le16 capab = mgmt->u.probe_resp.capab_info; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) return 0; /* TODO */ if (country_ie && (capab & cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT) || capab & cpu_to_le16(WLAN_CAPABILITY_RADIO_MEASURE))) { has_80211h_pwr = ieee80211_find_80211h_pwr_constr( channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); pwr_level_80211h = max_t(int, 0, chan_pwr - pwr_reduction_80211h); } if (cisco_dtpc_ie) { ieee80211_find_cisco_dtpc( channel, cisco_dtpc_ie, &pwr_level_cisco); has_cisco_pwr = true; } if (!has_80211h_pwr && !has_cisco_pwr) return 0; /* If we have both 802.11h and Cisco DTPC, apply both limits * by picking the smallest of the two power levels advertised. */ if (has_80211h_pwr && (!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) { new_ap_level = pwr_level_80211h; if (link->ap_power_level == new_ap_level) return 0; sdata_dbg(sdata, "Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n", pwr_level_80211h, chan_pwr, pwr_reduction_80211h, link->u.mgd.bssid); } else { /* has_cisco_pwr is always true here. */ new_ap_level = pwr_level_cisco; if (link->ap_power_level == new_ap_level) return 0; sdata_dbg(sdata, "Limiting TX power to %d dBm as advertised by %pM\n", pwr_level_cisco, link->u.mgd.bssid); } link->ap_power_level = new_ap_level; if (__ieee80211_recalc_txpower(link)) return BSS_CHANGED_TXPOWER; return 0; } /* powersave */ static void ieee80211_enable_ps(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_conf *conf = &local->hw.conf; /* * If we are scanning right now then the parameters will * take effect when scan finishes. */ if (local->scanning) return; if (conf->dynamic_ps_timeout > 0 && !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(conf->dynamic_ps_timeout)); } else { if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) ieee80211_send_nullfunc(local, sdata, true); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) return; conf->flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } } static void ieee80211_change_ps(struct ieee80211_local *local) { struct ieee80211_conf *conf = &local->hw.conf; if (local->ps_sdata) { ieee80211_enable_ps(local, local->ps_sdata); } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); } } static bool ieee80211_powersave_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *mgd = &sdata->u.mgd; struct sta_info *sta = NULL; bool authorized = false; if (!mgd->powersave) return false; if (mgd->broken_ap) return false; if (!mgd->associated) return false; if (mgd->flags & IEEE80211_STA_CONNECTION_POLL) return false; if (!(local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) && !sdata->deflink.u.mgd.have_beacon) return false; rcu_read_lock(); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (sta) authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED); rcu_read_unlock(); return authorized; } /* need to hold RTNL or interface lock */ void ieee80211_recalc_ps(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata, *found = NULL; int count = 0; int timeout; if (!ieee80211_hw_check(&local->hw, SUPPORTS_PS) || ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) { local->ps_sdata = NULL; return; } list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_AP) { /* If an AP vif is found, then disable PS * by setting the count to zero thereby setting * ps_sdata to NULL. */ count = 0; break; } if (sdata->vif.type != NL80211_IFTYPE_STATION) continue; found = sdata; count++; } if (count == 1 && ieee80211_powersave_allowed(found)) { u8 dtimper = found->deflink.u.mgd.dtim_period; timeout = local->dynamic_ps_forced_timeout; if (timeout < 0) timeout = 100; local->hw.conf.dynamic_ps_timeout = timeout; /* If the TIM IE is invalid, pretend the value is 1 */ if (!dtimper) dtimper = 1; local->hw.conf.ps_dtim_period = dtimper; local->ps_sdata = found; } else { local->ps_sdata = NULL; } ieee80211_change_ps(local); } void ieee80211_recalc_ps_vif(struct ieee80211_sub_if_data *sdata) { bool ps_allowed = ieee80211_powersave_allowed(sdata); if (sdata->vif.cfg.ps != ps_allowed) { sdata->vif.cfg.ps = ps_allowed; ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_PS); } } void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_disable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_PS, false); } void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, dynamic_ps_enable_work); struct ieee80211_sub_if_data *sdata = local->ps_sdata; struct ieee80211_if_managed *ifmgd; unsigned long flags; int q; /* can only happen when PS was just disabled anyway */ if (!sdata) return; ifmgd = &sdata->u.mgd; if (local->hw.conf.flags & IEEE80211_CONF_PS) return; if (local->hw.conf.dynamic_ps_timeout > 0) { /* don't enter PS if TX frames are pending */ if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); return; } /* * transmission can be stopped by others which leads to * dynamic_ps_timer expiry. Postpone the ps timer if it * is not the actual idle state. */ spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (q = 0; q < local->hw.queues; q++) { if (local->queue_stop_reasons[q]) { spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); return; } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && !(ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { if (drv_tx_frames_pending(local)) { mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies( local->hw.conf.dynamic_ps_timeout)); } else { ieee80211_send_nullfunc(local, sdata, true); /* Flush to get the tx status of nullfunc frame */ ieee80211_flush_queues(local, sdata, false); } } if (!(ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) || (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } } void ieee80211_dynamic_ps_timer(struct timer_list *t) { struct ieee80211_local *local = from_timer(local, t, dynamic_ps_timer); wiphy_work_queue(local->hw.wiphy, &local->dynamic_ps_enable_work); } void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, dfs_cac_timer_work.work); struct cfg80211_chan_def chandef = link->conf->chanreq.oper; struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (sdata->wdev.links[link->link_id].cac_started) { ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_FINISHED, GFP_KERNEL, link->link_id); } } static bool __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool ret = false; int ac; if (local->hw.queues < IEEE80211_NUM_ACS) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; int non_acm_ac; unsigned long now = jiffies; if (tx_tspec->action == TX_TSPEC_ACTION_NONE && tx_tspec->admitted_time && time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->consumed_tx_time = 0; tx_tspec->time_slice_start = now; if (tx_tspec->downgraded) tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; } switch (tx_tspec->action) { case TX_TSPEC_ACTION_STOP_DOWNGRADE: /* take the original parameters */ if (drv_conf_tx(local, &sdata->deflink, ac, &sdata->deflink.tx_conf[ac])) link_err(&sdata->deflink, "failed to set TX queue parameters for queue %d\n", ac); tx_tspec->action = TX_TSPEC_ACTION_NONE; tx_tspec->downgraded = false; ret = true; break; case TX_TSPEC_ACTION_DOWNGRADE: if (time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->action = TX_TSPEC_ACTION_NONE; ret = true; break; } /* downgrade next lower non-ACM AC */ for (non_acm_ac = ac + 1; non_acm_ac < IEEE80211_NUM_ACS; non_acm_ac++) if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac))) break; /* Usually the loop will result in using BK even if it * requires admission control, but such a configuration * makes no sense and we have to transmit somehow - the * AC selection does the same thing. * If we started out trying to downgrade from BK, then * the extra condition here might be needed. */ if (non_acm_ac >= IEEE80211_NUM_ACS) non_acm_ac = IEEE80211_AC_BK; if (drv_conf_tx(local, &sdata->deflink, ac, &sdata->deflink.tx_conf[non_acm_ac])) link_err(&sdata->deflink, "failed to set TX queue parameters for queue %d\n", ac); tx_tspec->action = TX_TSPEC_ACTION_NONE; ret = true; wiphy_delayed_work_queue(local->hw.wiphy, &ifmgd->tx_tspec_wk, tx_tspec->time_slice_start + HZ - now + 1); break; case TX_TSPEC_ACTION_NONE: /* nothing now */ break; } } return ret; } void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) { if (__ieee80211_sta_handle_tspec_ac_params(sdata)) ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_QOS); } static void ieee80211_sta_handle_tspec_ac_params_wk(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata; sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.tx_tspec_wk.work); ieee80211_sta_handle_tspec_ac_params(sdata); } void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_tx_queue_params *params = link->tx_conf; u8 ac; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { mlme_dbg(sdata, "WMM AC=%d acm=%d aifs=%d cWmin=%d cWmax=%d txop=%d uapsd=%d, downgraded=%d\n", ac, params[ac].acm, params[ac].aifs, params[ac].cw_min, params[ac].cw_max, params[ac].txop, params[ac].uapsd, ifmgd->tx_tspec[ac].downgraded); if (!ifmgd->tx_tspec[ac].downgraded && drv_conf_tx(local, link, ac, ¶ms[ac])) link_err(link, "failed to set TX queue parameters for AC %d\n", ac); } } /* MLME */ static bool _ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_link_data *link, const u8 *wmm_param, size_t wmm_param_len, const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t left; int count, mu_edca_count, ac; const u8 *pos; u8 uapsd_queues = 0; if (!local->ops->conf_tx) return false; if (local->hw.queues < IEEE80211_NUM_ACS) return false; if (!wmm_param) return false; if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) return false; if (ifmgd->flags & IEEE80211_STA_UAPSD_ENABLED) uapsd_queues = ifmgd->uapsd_queues; count = wmm_param[6] & 0x0f; /* -1 is the initial value of ifmgd->mu_edca_last_param_set. * if mu_edca was preset before and now it disappeared tell * the driver about it. */ mu_edca_count = mu_edca ? mu_edca->mu_qos_info & 0x0f : -1; if (count == link->u.mgd.wmm_last_param_set && mu_edca_count == link->u.mgd.mu_edca_last_param_set) return false; link->u.mgd.wmm_last_param_set = count; link->u.mgd.mu_edca_last_param_set = mu_edca_count; pos = wmm_param + 8; left = wmm_param_len - 8; memset(¶ms, 0, sizeof(params)); sdata->wmm_acm = 0; for (; left >= 4; left -= 4, pos += 4) { int aci = (pos[0] >> 5) & 0x03; int acm = (pos[0] >> 4) & 0x01; bool uapsd = false; switch (aci) { case 1: /* AC_BK */ ac = IEEE80211_AC_BK; if (acm) sdata->wmm_acm |= BIT(1) | BIT(2); /* BK/- */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BK) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_bk; break; case 2: /* AC_VI */ ac = IEEE80211_AC_VI; if (acm) sdata->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VI) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_vi; break; case 3: /* AC_VO */ ac = IEEE80211_AC_VO; if (acm) sdata->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_VO) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_vo; break; case 0: /* AC_BE */ default: ac = IEEE80211_AC_BE; if (acm) sdata->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */ if (uapsd_queues & IEEE80211_WMM_IE_STA_QOSINFO_AC_BE) uapsd = true; params[ac].mu_edca = !!mu_edca; if (mu_edca) params[ac].mu_edca_param_rec = mu_edca->ac_be; break; } params[ac].aifs = pos[0] & 0x0f; if (params[ac].aifs < 2) { link_info(link, "AP has invalid WMM params (AIFSN=%d for ACI %d), will use 2\n", params[ac].aifs, aci); params[ac].aifs = 2; } params[ac].cw_max = ecw2cw((pos[1] & 0xf0) >> 4); params[ac].cw_min = ecw2cw(pos[1] & 0x0f); params[ac].txop = get_unaligned_le16(pos + 2); params[ac].acm = acm; params[ac].uapsd = uapsd; if (params[ac].cw_min == 0 || params[ac].cw_min > params[ac].cw_max) { link_info(link, "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n", params[ac].cw_min, params[ac].cw_max, aci); return false; } ieee80211_regulatory_limit_wmm_params(sdata, ¶ms[ac], ac); } /* WMM specification requires all 4 ACIs. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (params[ac].cw_min == 0) { link_info(link, "AP has invalid WMM params (missing AC %d), using defaults\n", ac); return false; } } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link->tx_conf[ac] = params[ac]; return true; } static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, struct ieee80211_link_data *link, const u8 *wmm_param, size_t wmm_param_len, const struct ieee80211_mu_edca_param_set *mu_edca) { if (!_ieee80211_sta_wmm_params(local, link, wmm_param, wmm_param_len, mu_edca)) return false; ieee80211_mgd_set_link_qos_params(link); /* enable WMM or activate new settings */ link->conf->qos = true; return true; } static void __ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) { lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.flags &= ~IEEE80211_STA_CONNECTION_POLL; ieee80211_run_deferred_scan(sdata->local); } static void ieee80211_stop_poll(struct ieee80211_sub_if_data *sdata) { lockdep_assert_wiphy(sdata->local->hw.wiphy); __ieee80211_stop_poll(sdata); } static u64 ieee80211_handle_bss_capability(struct ieee80211_link_data *link, u16 capab, bool erp_valid, u8 erp) { struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_supported_band *sband; u64 changed = 0; bool use_protection; bool use_short_preamble; bool use_short_slot; sband = ieee80211_get_link_sband(link); if (!sband) return changed; if (erp_valid) { use_protection = (erp & WLAN_ERP_USE_PROTECTION) != 0; use_short_preamble = (erp & WLAN_ERP_BARKER_PREAMBLE) == 0; } else { use_protection = false; use_short_preamble = !!(capab & WLAN_CAPABILITY_SHORT_PREAMBLE); } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); if (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { bss_conf->use_cts_prot = use_protection; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (use_short_preamble != bss_conf->use_short_preamble) { bss_conf->use_short_preamble = use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } if (use_short_slot != bss_conf->use_short_slot) { bss_conf->use_short_slot = use_short_slot; changed |= BSS_CHANGED_ERP_SLOT; } return changed; } static u64 ieee80211_link_set_associated(struct ieee80211_link_data *link, struct cfg80211_bss *cbss) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_bss *bss = (void *)cbss->priv; u64 changed = BSS_CHANGED_QOS; /* not really used in MLO */ sdata->u.mgd.beacon_timeout = usecs_to_jiffies(ieee80211_tu_to_usec(beacon_loss_count * bss_conf->beacon_int)); changed |= ieee80211_handle_bss_capability(link, bss_conf->assoc_capability, bss->has_erp_value, bss->erp_value); ieee80211_check_rate_mask(link); link->conf->bss = cbss; memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN); if (sdata->vif.p2p || sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { int ret; ret = cfg80211_get_p2p_attr( ies->data, ies->len, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, (u8 *) &bss_conf->p2p_noa_attr, sizeof(bss_conf->p2p_noa_attr)); if (ret >= 2) { link->u.mgd.p2p_noa_index = bss_conf->p2p_noa_attr.index; changed |= BSS_CHANGED_P2P_PS; } } rcu_read_unlock(); } if (link->u.mgd.have_beacon) { bss_conf->beacon_rate = bss->beacon_rate; changed |= BSS_CHANGED_BEACON_INFO; } else { bss_conf->beacon_rate = NULL; } /* Tell the driver to monitor connection quality (if supported) */ if (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI && bss_conf->cqm_rssi_thold) changed |= BSS_CHANGED_CQM; return changed; } static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, u64 changed[IEEE80211_MLD_MAX_NUM_LINKS]) { struct ieee80211_local *local = sdata->local; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; u64 vif_changed = BSS_CHANGED_ASSOC; unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); sdata->u.mgd.associated = true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; if (!cbss || assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; if (ieee80211_vif_is_mld(&sdata->vif) && !(ieee80211_vif_usable_links(&sdata->vif) & BIT(link_id))) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; changed[link_id] |= ieee80211_link_set_associated(link, cbss); } /* just to be sure */ ieee80211_stop_poll(sdata); ieee80211_led_assoc(local, 1); vif_cfg->assoc = 1; /* Enable ARP filtering */ if (vif_cfg->arp_addr_cnt) vif_changed |= BSS_CHANGED_ARP_FILTER; if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; if (!cbss || !(BIT(link_id) & ieee80211_vif_usable_links(&sdata->vif)) || assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; ieee80211_link_info_change_notify(sdata, link, changed[link_id]); ieee80211_recalc_smps(sdata, link); } ieee80211_vif_cfg_change_notify(sdata, vif_changed); } else { ieee80211_bss_info_change_notify(sdata, vif_changed | changed[0]); } ieee80211_recalc_ps(local); /* leave this here to not change ordering in non-MLO cases */ if (!ieee80211_vif_is_mld(&sdata->vif)) ieee80211_recalc_smps(sdata, &sdata->deflink); ieee80211_recalc_ps_vif(sdata); netif_carrier_on(sdata->dev); } static void ieee80211_ml_reconf_reset(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *add_links_data = sdata->u.mgd.reconf.add_links_data; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->u.mgd.reconf.added_links | sdata->u.mgd.reconf.removed_links)) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.reconf.wk); sdata->u.mgd.reconf.added_links = 0; sdata->u.mgd.reconf.removed_links = 0; sdata->u.mgd.reconf.dialog_token = 0; if (add_links_data) { struct cfg80211_mlo_reconf_done_data done_data = {}; u8 link_id; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) done_data.links[link_id].bss = add_links_data->link[link_id].bss; cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); kfree(sdata->u.mgd.reconf.add_links_data); sdata->u.mgd.reconf.add_links_data = NULL; } } static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, u16 stype, u16 reason, bool tx, u8 *frame_buf) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); unsigned int link_id; u64 changed = 0; struct ieee80211_prep_tx_info info = { .subtype = stype, .was_assoc = true, .link_id = ffs(sdata->vif.active_links) - 1, }; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!ap_sta)) return; if (WARN_ON_ONCE(tx && !frame_buf)) return; if (WARN_ON(!ifmgd->associated)) return; ieee80211_stop_poll(sdata); ifmgd->associated = false; if (tx) { bool tx_link_found = false; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; if (!ieee80211_vif_link_active(&sdata->vif, link_id)) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON_ONCE(!link)) continue; if (link->u.mgd.csa.blocked_tx) continue; tx_link_found = true; break; } tx = tx_link_found; } /* other links will be destroyed */ sdata->deflink.conf->bss = NULL; sdata->deflink.conf->epcs_support = false; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); /* * if we want to get out of ps before disassoc (why?) we have * to do it before sending disassoc, as otherwise the null-packet * won't be valid. */ if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } local->ps_sdata = NULL; /* disable per-vif ps */ ieee80211_recalc_ps_vif(sdata); /* make sure ongoing transmission finishes */ synchronize_net(); /* * drop any frame before deauth/disassoc, this can be data or * management frame. Since we are disconnecting, we should not * insist sending these frames which can take time and delay * the disconnection and possible the roaming. */ ieee80211_flush_queues(local, sdata, true); if (tx) { drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, reason, true, frame_buf); /* flush out frame - make sure the deauth was actually sent */ ieee80211_flush_queues(local, sdata, false); drv_mgd_complete_tx(sdata->local, sdata, &info); } else if (frame_buf) { ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, reason, false, frame_buf); } /* clear AP addr only after building the needed mgmt frames */ eth_zero_addr(sdata->deflink.u.mgd.bssid); eth_zero_addr(sdata->vif.cfg.ap_addr); sdata->vif.cfg.ssid_len = 0; /* Remove TDLS peers */ __sta_info_flush(sdata, false, -1, ap_sta); if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { /* Only move the AP state */ sta_info_move_state(ap_sta, IEEE80211_STA_NONE); } else { /* Remove AP peer */ sta_info_flush(sdata, -1); } /* finally reset all BSS / config parameters */ if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= ieee80211_reset_erp_info(sdata); ieee80211_led_assoc(local, 0); changed |= BSS_CHANGED_ASSOC; sdata->vif.cfg.assoc = false; sdata->deflink.u.mgd.p2p_noa_index = -1; memset(&sdata->vif.bss_conf.p2p_noa_attr, 0, sizeof(sdata->vif.bss_conf.p2p_noa_attr)); /* on the next assoc, re-program HT/VHT parameters */ memset(&ifmgd->ht_capa, 0, sizeof(ifmgd->ht_capa)); memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); /* * reset MU-MIMO ownership and group data in default link, * if used, other links are destroyed */ memset(sdata->vif.bss_conf.mu_group.membership, 0, sizeof(sdata->vif.bss_conf.mu_group.membership)); memset(sdata->vif.bss_conf.mu_group.position, 0, sizeof(sdata->vif.bss_conf.mu_group.position)); if (!ieee80211_vif_is_mld(&sdata->vif)) changed |= BSS_CHANGED_MU_GROUPS; sdata->vif.bss_conf.mu_mimo_owner = false; sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL; timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); /* Disable ARP filtering */ if (sdata->vif.cfg.arp_addr_cnt) changed |= BSS_CHANGED_ARP_FILTER; sdata->vif.bss_conf.qos = false; if (!ieee80211_vif_is_mld(&sdata->vif)) { changed |= BSS_CHANGED_QOS; /* The BSSID (not really interesting) and HT changed */ changed |= BSS_CHANGED_BSSID | BSS_CHANGED_HT; ieee80211_bss_info_change_notify(sdata, changed); } else { ieee80211_vif_cfg_change_notify(sdata, changed); } if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { /* * After notifying the driver about the disassoc, * remove the ap sta. */ sta_info_flush(sdata, -1); } /* disassociated - set to defaults now */ ieee80211_set_wmm_default(&sdata->deflink, false, false); timer_delete_sync(&sdata->u.mgd.conn_mon_timer); timer_delete_sync(&sdata->u.mgd.bcn_mon_timer); timer_delete_sync(&sdata->u.mgd.timer); sdata->vif.bss_conf.dtim_period = 0; sdata->vif.bss_conf.beacon_rate = NULL; sdata->deflink.u.mgd.have_beacon = false; sdata->deflink.u.mgd.tracking_signal_avg = false; sdata->deflink.u.mgd.disable_wmm_tracking = false; ifmgd->flags = 0; for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; ieee80211_link_release_channel(link); } sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.blocked_tx = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.ignored_same_chan = false; ieee80211_vif_unblock_queues_csa(sdata); /* existing TX TSPEC sessions no longer exist */ memset(ifmgd->tx_tspec, 0, sizeof(ifmgd->tx_tspec)); wiphy_delayed_work_cancel(local->hw.wiphy, &ifmgd->tx_tspec_wk); sdata->vif.bss_conf.power_type = IEEE80211_REG_UNSET_AP; sdata->vif.bss_conf.pwr_reduction = 0; ieee80211_clear_tpe(&sdata->vif.bss_conf.tpe); sdata->vif.cfg.eml_cap = 0; sdata->vif.cfg.eml_med_sync_delay = 0; sdata->vif.cfg.mld_capa_op = 0; memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->neg_ttlm_timeout_work); sdata->u.mgd.removed_links = 0; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->teardown_ttlm_work); /* if disconnection happens in the middle of the ML reconfiguration * flow, cfg80211 must called to release the BSS references obtained * when the flow started. */ ieee80211_ml_reconf_reset(sdata); ieee80211_vif_set_links(sdata, 0, 0); ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; ifmgd->epcs.enabled = false; ifmgd->epcs.dialog_token = 0; memset(ifmgd->userspace_selectors, 0, sizeof(ifmgd->userspace_selectors)); } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (!(ifmgd->flags & IEEE80211_STA_CONNECTION_POLL)) return; __ieee80211_stop_poll(sdata); ieee80211_recalc_ps(local); if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; /* * We've received a probe response, but are not sure whether * we have or will be receiving any beacons or data, so let's * schedule the timers again, just in case. */ ieee80211_sta_reset_beacon_monitor(sdata); mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, u16 tx_time) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 tid; int ac; struct ieee80211_sta_tx_tspec *tx_tspec; unsigned long now = jiffies; if (!ieee80211_is_data_qos(hdr->frame_control)) return; tid = ieee80211_get_tid(hdr); ac = ieee80211_ac_from_tid(tid); tx_tspec = &ifmgd->tx_tspec[ac]; if (likely(!tx_tspec->admitted_time)) return; if (time_after(now, tx_tspec->time_slice_start + HZ)) { tx_tspec->consumed_tx_time = 0; tx_tspec->time_slice_start = now; if (tx_tspec->downgraded) { tx_tspec->action = TX_TSPEC_ACTION_STOP_DOWNGRADE; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &ifmgd->tx_tspec_wk, 0); } } if (tx_tspec->downgraded) return; tx_tspec->consumed_tx_time += tx_time; if (tx_tspec->consumed_tx_time >= tx_tspec->admitted_time) { tx_tspec->downgraded = true; tx_tspec->action = TX_TSPEC_ACTION_DOWNGRADE; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &ifmgd->tx_tspec_wk, 0); } } void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr, bool ack, u16 tx_time) { ieee80211_sta_tx_wmm_ac_notify(sdata, hdr, tx_time); if (!ieee80211_is_any_nullfunc(hdr->frame_control) || !sdata->u.mgd.probe_send_count) return; if (ack) sdata->u.mgd.probe_send_count = 0; else sdata->u.mgd.nullfunc_failed = true; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, (u32)-1, channel, ssid, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); if (skb) ieee80211_tx_skb(sdata, skb); } static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *dst = sdata->vif.cfg.ap_addr; u8 unicast_limit = max(1, max_probe_tries - 3); struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif))) return; /* * Try sending broadcast probe requests for the last three * probe requests after the first ones failed since some * buggy APs only support broadcast probe requests. */ if (ifmgd->probe_send_count >= unicast_limit) dst = NULL; /* * When the hardware reports an accurate Tx ACK status, it's * better to send a nullfunc frame instead of a probe request, * as it will kick us off the AP quickly if we aren't associated * anymore. The timeout will be reset if the frame is ACKed by * the AP. */ ifmgd->probe_send_count++; if (dst) { sta = sta_info_get(sdata, dst); if (!WARN_ON(!sta)) ieee80211_check_fast_rx(sta); } if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; ieee80211_send_nullfunc(sdata->local, sdata, false); } else { ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, sdata->vif.cfg.ssid, sdata->vif.cfg.ssid_len, sdata->deflink.conf->bss->channel); } ifmgd->probe_timeout = jiffies + msecs_to_jiffies(probe_wait_ms); run_again(sdata, ifmgd->probe_timeout); } static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, bool beacon) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool already = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON_ONCE(ieee80211_vif_is_mld(&sdata->vif))) return; if (!ieee80211_sdata_running(sdata)) return; if (!ifmgd->associated) return; if (sdata->local->tmp_channel || sdata->local->scanning) return; if (sdata->local->suspending) { /* reschedule after resume */ ieee80211_reset_ap_probe(sdata); return; } if (beacon) { mlme_dbg_ratelimited(sdata, "detected beacon loss from AP (missed %d beacons) - probing\n", beacon_loss_count); ieee80211_cqm_beacon_loss_notify(&sdata->vif, GFP_KERNEL); } /* * The driver/our work has already reported this event or the * connection monitoring has kicked in and we have already sent * a probe request. Or maybe the AP died and the driver keeps * reporting until we disassociate... * * In either case we have to ignore the current call to this * function (except for setting the correct probe reason bit) * because otherwise we would reset the timer every time and * never check whether we received a probe response! */ if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) already = true; ifmgd->flags |= IEEE80211_STA_CONNECTION_POLL; if (already) return; ieee80211_recalc_ps(sdata->local); ifmgd->probe_send_count = 0; ieee80211_mgd_probe_ap_send(sdata); } struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct cfg80211_bss *cbss; struct sk_buff *skb; const struct element *ssid; int ssid_len; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION || ieee80211_vif_is_mld(&sdata->vif))) return NULL; if (ifmgd->associated) cbss = sdata->deflink.conf->bss; else if (ifmgd->auth_data) cbss = ifmgd->auth_data->bss; else if (ifmgd->assoc_data && ifmgd->assoc_data->link[0].bss) cbss = ifmgd->assoc_data->link[0].bss; else return NULL; rcu_read_lock(); ssid = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); if (WARN_ONCE(!ssid || ssid->datalen > IEEE80211_MAX_SSID_LEN, "invalid SSID element (len=%d)", ssid ? ssid->datalen : -1)) ssid_len = 0; else ssid_len = ssid->datalen; skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, ssid->data, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); return skb; } EXPORT_SYMBOL(ieee80211_ap_probereq_get); static void ieee80211_report_disconnect(struct ieee80211_sub_if_data *sdata, const u8 *buf, size_t len, bool tx, u16 reason, bool reconnect) { struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = tx ? DEAUTH_TX_EVENT : DEAUTH_RX_EVENT, .u.mlme.reason = reason, }; if (tx) cfg80211_tx_mlme_mgmt(sdata->dev, buf, len, reconnect); else cfg80211_rx_mlme_mgmt(sdata->dev, buf, len); drv_event_callback(sdata->local, sdata, &event); } static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; if (!ifmgd->driver_disconnect) { unsigned int link_id; /* * AP is probably out of range (or not reachable for another * reason) so remove the bss structs for that AP. In the case * of multi-link, it's not clear that all of them really are * out of range, but if they weren't the driver likely would * have switched to just have a single link active? */ for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); if (!link || !link->conf->bss) continue; cfg80211_unlink_bss(local->hw.wiphy, link->conf->bss); link->conf->bss = NULL; } } ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, ifmgd->driver_disconnect ? WLAN_REASON_DEAUTH_LEAVING : WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, true, frame_buf); /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, ifmgd->reconnect); ifmgd->reconnect = false; } static void ieee80211_beacon_connection_loss_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.beacon_connection_loss_work); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; if (ifmgd->connection_loss) { sdata_info(sdata, "Connection to AP %pM lost\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); ifmgd->connection_loss = false; } else if (ifmgd->driver_disconnect) { sdata_info(sdata, "Driver requested disconnection from AP %pM\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); ifmgd->driver_disconnect = false; } else { if (ifmgd->associated) sdata->deflink.u.mgd.beacon_loss_count++; ieee80211_mgd_probe_ap(sdata, true); } } static void ieee80211_csa_connection_drop_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.csa_connection_drop_work); __ieee80211_disconnect(sdata); } void ieee80211_beacon_loss(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_hw *hw = &sdata->local->hw; trace_api_beacon_loss(sdata); sdata->u.mgd.connection_loss = false; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_beacon_loss); void ieee80211_connection_loss(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata; struct ieee80211_hw *hw; KUNIT_STATIC_STUB_REDIRECT(ieee80211_connection_loss, vif); sdata = vif_to_sdata(vif); hw = &sdata->local->hw; trace_api_connection_loss(sdata); sdata->u.mgd.connection_loss = true; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_connection_loss); void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_hw *hw = &sdata->local->hw; trace_api_disconnect(sdata, reconnect); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; sdata->u.mgd.driver_disconnect = true; sdata->u.mgd.reconnect = reconnect; wiphy_work_queue(hw->wiphy, &sdata->u.mgd.beacon_connection_loss_work); } EXPORT_SYMBOL(ieee80211_disconnect); static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, bool assoc) { struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.auth_data = NULL; if (!assoc) { /* * we are not authenticated yet, the only timer that could be * running is the timeout for the authentication response which * which is not relevant anymore. */ timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, auth_data->ap_addr); /* other links are destroyed */ eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); } cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); kfree(auth_data); } enum assoc_status { ASSOC_SUCCESS, ASSOC_REJECTED, ASSOC_TIMEOUT, ASSOC_ABANDON, }; static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, enum assoc_status status) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->u.mgd.assoc_data = NULL; if (status != ASSOC_SUCCESS) { /* * we are not associated yet, the only timer that could be * running is the timeout for the association response which * which is not relevant anymore. */ timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, assoc_data->ap_addr); eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; sdata->vif.bss_conf.mu_mimo_owner = false; if (status != ASSOC_REJECTED) { struct cfg80211_assoc_failure data = { .timeout = status == ASSOC_TIMEOUT, }; int i; BUILD_BUG_ON(ARRAY_SIZE(data.bss) != ARRAY_SIZE(assoc_data->link)); for (i = 0; i < ARRAY_SIZE(data.bss); i++) data.bss[i] = assoc_data->link[i].bss; if (ieee80211_vif_is_mld(&sdata->vif)) data.ap_mld_addr = assoc_data->ap_addr; cfg80211_assoc_failure(sdata->dev, &data); } ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); } kfree(assoc_data); } static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; const struct element *challenge; u8 *pos; u32 tx_flags = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, .link_id = auth_data->link_id, }; pos = mgmt->u.auth.variable; challenge = cfg80211_find_elem(WLAN_EID_CHALLENGE, pos, len - (pos - (u8 *)mgmt)); if (!challenge) return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata, &info); if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, (void *)challenge, challenge->datalen + sizeof(*challenge), auth_data->ap_addr, auth_data->ap_addr, auth_data->key, auth_data->key_len, auth_data->key_idx, tx_flags); } static bool ieee80211_mark_sta_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const u8 *ap_addr = ifmgd->auth_data->ap_addr; struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata_info(sdata, "authenticated\n"); ifmgd->auth_data->done = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_ASSOC; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); /* move station state to auth */ sta = sta_info_get(sdata, ap_addr); if (!sta) { WARN_ONCE(1, "%s: STA %pM not found", sdata->name, ap_addr); return false; } if (sta_info_move_state(sta, IEEE80211_STA_AUTH)) { sdata_info(sdata, "failed moving %pM to auth\n", ap_addr); return false; } return true; } static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 auth_alg, auth_transaction, status_code; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, }; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 6) return; if (!ifmgd->auth_data || ifmgd->auth_data->done) return; if (!ether_addr_equal(ifmgd->auth_data->ap_addr, mgmt->bssid)) return; auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); info.link_id = ifmgd->auth_data->link_id; if (auth_alg != ifmgd->auth_data->algorithm || (auth_alg != WLAN_AUTH_SAE && auth_transaction != ifmgd->auth_data->expected_transaction) || (auth_alg == WLAN_AUTH_SAE && (auth_transaction < ifmgd->auth_data->expected_transaction || auth_transaction > 2))) { sdata_info(sdata, "%pM unexpected authentication state: alg %d (expected %d) transact %d (expected %d)\n", mgmt->sa, auth_alg, ifmgd->auth_data->algorithm, auth_transaction, ifmgd->auth_data->expected_transaction); goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && (status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED || (auth_transaction == 1 && (status_code == WLAN_STATUS_SAE_HASH_TO_ELEMENT || status_code == WLAN_STATUS_SAE_PK)))) { /* waiting for userspace now */ ifmgd->auth_data->waiting = true; ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); goto notify_driver; } sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); goto notify_driver; } switch (ifmgd->auth_data->algorithm) { case WLAN_AUTH_OPEN: case WLAN_AUTH_LEAP: case WLAN_AUTH_FT: case WLAN_AUTH_SAE: case WLAN_AUTH_FILS_SK: case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { ieee80211_auth_challenge(sdata, mgmt, len); /* need another frame */ return; } break; default: WARN_ONCE(1, "invalid auth alg %d", ifmgd->auth_data->algorithm); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; info.success = 1; drv_event_callback(sdata->local, sdata, &event); if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { if (!ieee80211_mark_sta_auth(sdata)) return; /* ignore frame -- wait for timeout */ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); ifmgd->auth_data->peer_confirmed = true; } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ case WLAN_REASON_##type: return #type const char *ieee80211_get_reason_code_string(u16 reason_code) { switch (reason_code) { case_WLAN(UNSPECIFIED); case_WLAN(PREV_AUTH_NOT_VALID); case_WLAN(DEAUTH_LEAVING); case_WLAN(DISASSOC_DUE_TO_INACTIVITY); case_WLAN(DISASSOC_AP_BUSY); case_WLAN(CLASS2_FRAME_FROM_NONAUTH_STA); case_WLAN(CLASS3_FRAME_FROM_NONASSOC_STA); case_WLAN(DISASSOC_STA_HAS_LEFT); case_WLAN(STA_REQ_ASSOC_WITHOUT_AUTH); case_WLAN(DISASSOC_BAD_POWER); case_WLAN(DISASSOC_BAD_SUPP_CHAN); case_WLAN(INVALID_IE); case_WLAN(MIC_FAILURE); case_WLAN(4WAY_HANDSHAKE_TIMEOUT); case_WLAN(GROUP_KEY_HANDSHAKE_TIMEOUT); case_WLAN(IE_DIFFERENT); case_WLAN(INVALID_GROUP_CIPHER); case_WLAN(INVALID_PAIRWISE_CIPHER); case_WLAN(INVALID_AKMP); case_WLAN(UNSUPP_RSN_VERSION); case_WLAN(INVALID_RSN_IE_CAP); case_WLAN(IEEE8021X_FAILED); case_WLAN(CIPHER_SUITE_REJECTED); case_WLAN(DISASSOC_UNSPECIFIED_QOS); case_WLAN(DISASSOC_QAP_NO_BANDWIDTH); case_WLAN(DISASSOC_LOW_ACK); case_WLAN(DISASSOC_QAP_EXCEED_TXOP); case_WLAN(QSTA_LEAVE_QBSS); case_WLAN(QSTA_NOT_USE); case_WLAN(QSTA_REQUIRE_SETUP); case_WLAN(QSTA_TIMEOUT); case_WLAN(QSTA_CIPHER_NOT_SUPP); case_WLAN(MESH_PEER_CANCELED); case_WLAN(MESH_MAX_PEERS); case_WLAN(MESH_CONFIG); case_WLAN(MESH_CLOSE); case_WLAN(MESH_MAX_RETRIES); case_WLAN(MESH_CONFIRM_TIMEOUT); case_WLAN(MESH_INVALID_GTK); case_WLAN(MESH_INCONSISTENT_PARAM); case_WLAN(MESH_INVALID_SECURITY); case_WLAN(MESH_PATH_ERROR); case_WLAN(MESH_PATH_NOFORWARD); case_WLAN(MESH_PATH_DEST_UNREACHABLE); case_WLAN(MAC_EXISTS_IN_MBSS); case_WLAN(MESH_CHAN_REGULATORY); case_WLAN(MESH_CHAN); default: return "<unknown>"; } } static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 2) return; if (!ether_addr_equal(mgmt->bssid, mgmt->sa)) { ieee80211_tdls_handle_disconnect(sdata, mgmt->sa, reason_code); return; } if (ifmgd->associated && ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) { sdata_info(sdata, "deauthenticated from %pM (Reason: %u=%s)\n", sdata->vif.cfg.ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, false); return; } if (ifmgd->assoc_data && ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->ap_addr)) { sdata_info(sdata, "deauthenticated from %pM while associating (Reason: %u=%s)\n", ifmgd->assoc_data->ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); return; } } static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u16 reason_code; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (len < 24 + 2) return; if (!ifmgd->associated || !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) return; reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); if (!ether_addr_equal(mgmt->bssid, mgmt->sa)) { ieee80211_tdls_handle_disconnect(sdata, mgmt->sa, reason_code); return; } sdata_info(sdata, "disassociated from %pM (Reason: %u=%s)\n", sdata->vif.cfg.ap_addr, reason_code, ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); ieee80211_report_disconnect(sdata, (u8 *)mgmt, len, false, reason_code, false); } static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct link_sta_info *link_sta, const struct ieee802_11_elems *elems) { const struct ieee80211_sta_he_cap *own_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (elems->ext_capab_len < 10) return false; if (!(elems->ext_capab[9] & WLAN_EXT_CAPA10_TWT_RESPONDER_SUPPORT)) return false; return link_sta->pub->he_cap.he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_RES && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[0] & IEEE80211_HE_MAC_CAP0_TWT_REQ); } static u64 ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct ieee802_11_elems *elems) { bool twt = ieee80211_twt_req_supported(sdata, sband, link_sta, elems); if (link->conf->twt_requester != twt) { link->conf->twt_requester = twt; return BSS_CHANGED_TWT; } return 0; } static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta) { const struct ieee80211_sta_he_cap *own_he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); return bss_conf->he_support && (link_sta->pub->he_cap.he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT) && own_he_cap && (own_he_cap->he_cap_elem.mac_cap_info[2] & IEEE80211_HE_MAC_CAP2_BCAST_TWT); } static void ieee80211_epcs_changed(struct ieee80211_sub_if_data *sdata, bool enabled) { /* in any case this is called, dialog token should be reset */ sdata->u.mgd.epcs.dialog_token = 0; if (sdata->u.mgd.epcs.enabled == enabled) return; sdata->u.mgd.epcs.enabled = enabled; cfg80211_epcs_changed(sdata->dev, enabled); } static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; u8 link_id; if (!sdata->u.mgd.epcs.enabled) return; lockdep_assert_wiphy(local->hw.wiphy); for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee802_11_elems *elems; struct ieee80211_link_data *link; const struct cfg80211_bss_ies *ies; bool ret; rcu_read_lock(); link = sdata_dereference(sdata->link[link_id], sdata); if (!link || !link->conf || !link->conf->bss) { rcu_read_unlock(); continue; } if (link->u.mgd.disable_wmm_tracking) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } ies = rcu_dereference(link->conf->bss->beacon_ies); if (!ies) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) { rcu_read_unlock(); ieee80211_set_wmm_default(link, false, false); continue; } ret = _ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set); kfree(elems); rcu_read_unlock(); if (!ret) { ieee80211_set_wmm_default(link, false, false); continue; } ieee80211_mgd_set_link_qos_params(link); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct cfg80211_bss *cbss, struct ieee80211_mgmt *mgmt, const u8 *elem_start, unsigned int elem_len, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data ?: sdata->u.mgd.reconf.add_links_data; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; struct ieee80211_elems_parse_params parse_params = { .mode = link->u.mgd.conn.mode, .start = elem_start, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; bool is_s1g = cbss->channel->band == NL80211_BAND_S1GHZ; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_supported_band *sband; struct ieee802_11_elems *elems; const __le16 prof_bss_param_ch_present = cpu_to_le16(IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT); u16 capab_info; bool ret; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return false; if (link_id == assoc_data->assoc_link_id) { capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); /* * we should not get to this flow unless the association was * successful, so set the status directly to success */ assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS; if (elems->ml_basic) { int bss_param_ch_cnt = ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); if (bss_param_ch_cnt < 0) { ret = false; goto out; } bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = link_id; } } else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC || !elems->prof || !(elems->prof->control & prof_bss_param_ch_present)) { ret = false; goto out; } else { const u8 *ptr = elems->prof->variable + elems->prof->sta_info_len - 1; int bss_param_ch_cnt; /* * During parsing, we validated that these fields exist, * otherwise elems->prof would have been set to NULL. */ capab_info = get_unaligned_le16(ptr); assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); bss_param_ch_cnt = ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(elems->prof); bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; bss_conf->bss_param_ch_cnt_link_id = link_id; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { link_info(link, "association response status code=%u\n", assoc_data->link[link_id].status); ret = true; goto out; } } if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); ret = false; goto out; } link->u.mgd.tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); /* * Some APs are erroneously not including some information in their * (re)association response frames. Try to recover by using the data * from the beacon or probe response. This seems to afflict mobile * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ if (!ieee80211_hw_check(&local->hw, STRICT) && !is_6ghz && ((assoc_data->wmm && !elems->wmm_param) || (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems *bss_elems; rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) bss_ies = kmemdup(ies, sizeof(*ies) + ies->len, GFP_ATOMIC); rcu_read_unlock(); if (!bss_ies) { ret = false; goto out; } parse_params.start = bss_ies->data; parse_params.len = bss_ies->len; parse_params.bss = cbss; parse_params.link_id = -1; bss_elems = ieee802_11_parse_elems_full(&parse_params); if (!bss_elems) { ret = false; goto out; } if (assoc_data->wmm && !elems->wmm_param && bss_elems->wmm_param) { elems->wmm_param = bss_elems->wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } /* * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ if (!elems->ht_cap_elem && bss_elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_cap_elem = bss_elems->ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } if (!elems->ht_operation && bss_elems->ht_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) { elems->ht_operation = bss_elems->ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } if (is_5ghz) { if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_cap_elem = bss_elems->vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } if (!elems->vht_operation && bss_elems->vht_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { elems->vht_operation = bss_elems->vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } } kfree(bss_elems); } /* * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. * Note that the ieee80211_config_bw() below would also check * for this (and more), but this has better error reporting. */ if (!is_6ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); ret = false; goto out; } if (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); ret = false; goto out; } /* check/update if AP changed anything in assoc response vs. scan */ if (ieee80211_config_bw(link, elems, link_id == assoc_data->assoc_link_id, changed, "assoc response")) { ret = false; goto out; } if (WARN_ON(!link->conf->chanreq.oper.chan)) { ret = false; goto out; } sband = local->hw.wiphy->bands[link->conf->chanreq.oper.chan->band]; /* Set up internal HT/VHT capabilities */ if (elems->ht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, elems->ht_cap_elem, link_sta); if (elems->vht_cap_elem && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { const struct ieee80211_vht_cap *bss_vht_cap = NULL; const struct cfg80211_bss_ies *ies; /* * Cisco AP module 9115 with FW 17.3 has a bug and sends a * too large maximum MPDU length in the association response * (indicating 12k) that it cannot actually process ... * Work around that. */ rcu_read_lock(); ies = rcu_dereference(cbss->ies); if (ies) { const struct element *elem; elem = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, ies->data, ies->len); if (elem && elem->datalen >= sizeof(*bss_vht_cap)) bss_vht_cap = (const void *)elem->data; } if (ieee80211_hw_check(&local->hw, STRICT) && (!bss_vht_cap || memcmp(bss_vht_cap, elems->vht_cap_elem, sizeof(*bss_vht_cap)))) { rcu_read_unlock(); ret = false; link_info(link, "VHT capabilities mismatch\n"); goto out; } ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, bss_vht_cap, link_sta); rcu_read_unlock(); } if (elems->he_operation && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE && elems->he_cap) { ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->he_6ghz_capa, link_sta); bss_conf->he_support = link_sta->pub->he_cap.has_he; if (elems->rsnx && elems->rsnx_len && (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_PROTECTED_TWT)) bss_conf->twt_protected = true; else bss_conf->twt_protected = false; *changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (elems->eht_operation && elems->eht_cap && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_EHT) { ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband, elems->he_cap, elems->he_cap_len, elems->eht_cap, elems->eht_cap_len, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; bss_conf->epcs_support = bss_conf->eht_support && !!(elems->eht_cap->fixed.mac_cap_info[0] & IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS); /* EPCS might be already enabled but a new added link * does not support EPCS. This should not really happen * in practice. */ if (sdata->u.mgd.epcs.enabled && !bss_conf->epcs_support) ieee80211_epcs_teardown(sdata); } else { bss_conf->eht_support = false; bss_conf->epcs_support = false; } } else { bss_conf->he_support = false; bss_conf->twt_requester = false; bss_conf->twt_protected = false; bss_conf->eht_support = false; bss_conf->epcs_support = false; } bss_conf->twt_broadcast = ieee80211_twt_bcast_support(sdata, bss_conf, sband, link_sta); if (bss_conf->he_support) { bss_conf->he_bss_color.color = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_MASK); bss_conf->he_bss_color.partial = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR); bss_conf->he_bss_color.enabled = !le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); if (bss_conf->he_bss_color.enabled) *changed |= BSS_CHANGED_HE_BSS_COLOR; bss_conf->htc_trig_based_pkt_ext = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); bss_conf->frame_time_rts_th = le32_get_bits(elems->he_operation->he_oper_params, IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); bss_conf->uora_exists = !!elems->uora_element; if (elems->uora_element) bss_conf->uora_ocw_range = elems->uora_element[0]; ieee80211_he_op_ie_to_bss_conf(&sdata->vif, elems->he_operation); ieee80211_he_spr_ie_to_bss_conf(&sdata->vif, elems->he_spr); /* TODO: OPEN: what happens if BSS color disable is set? */ } if (cbss->transmitted_bss) { bss_conf->nontransmitted = true; ether_addr_copy(bss_conf->transmitter_bssid, cbss->transmitted_bss->bssid); bss_conf->bssid_indicator = cbss->max_bssid_indicator; bss_conf->bssid_index = cbss->bssid_index; } /* * Some APs, e.g. Netgear WNDR3700, report invalid HT operation data * in their association response, so ignore that data for our own * configuration. If it changed since the last beacon, we'll get the * next beacon and update then. */ /* * If an operating mode notification IE is present, override the * NSS calculation (that would be done in rate_control_rate_init()) * and use the # of streams from that element. */ if (elems->opmode_notif && !(*elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF)) { u8 nss; nss = *elems->opmode_notif & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; link_sta->pub->rx_nss = nss; } /* * Always handle WMM once after association regardless * of the first value the AP uses. Setting -1 here has * that effect because the AP values is an unsigned * 4-bit value. */ link->u.mgd.wmm_last_param_set = -1; link->u.mgd.mu_edca_last_param_set = -1; if (link->u.mgd.disable_wmm_tracking) { ieee80211_set_wmm_default(link, false, false); } else if (!ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) { /* still enable QoS since we might have HT/VHT */ ieee80211_set_wmm_default(link, false, true); /* disable WMM tracking in this case to disable * tracking WMM parameter changes in the beacon if * the parameters weren't actually valid. Doing so * avoids changing parameters very strangely when * the AP is going back and forth between valid and * invalid parameters. */ link->u.mgd.disable_wmm_tracking = true; } if (elems->max_idle_period_ie) { bss_conf->max_idle_period = le16_to_cpu(elems->max_idle_period_ie->max_idle_period); bss_conf->protected_keep_alive = !!(elems->max_idle_period_ie->idle_options & WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); *changed |= BSS_CHANGED_KEEP_ALIVE; } else { bss_conf->max_idle_period = 0; bss_conf->protected_keep_alive = false; } /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ bss_conf->assoc_capability = capab_info; ret = true; out: kfree(elems); kfree(bss_ies); return ret; } static int ieee80211_mgd_setup_link_sta(struct ieee80211_link_data *link, struct sta_info *sta, struct link_sta_info *link_sta, struct cfg80211_bss *cbss) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss = (void *)cbss->priv; u32 rates = 0, basic_rates = 0; bool have_higher_than_11mbit = false; int min_rate = INT_MAX, min_rate_index = -1; struct ieee80211_supported_band *sband; memcpy(link_sta->addr, cbss->bssid, ETH_ALEN); memcpy(link_sta->pub->addr, cbss->bssid, ETH_ALEN); /* TODO: S1G Basic Rate Set is expressed elsewhere */ if (cbss->channel->band == NL80211_BAND_S1GHZ) { ieee80211_s1g_sta_rate_init(sta); return 0; } sband = local->hw.wiphy->bands[cbss->channel->band]; ieee80211_get_rates(sband, bss->supp_rates, bss->supp_rates_len, NULL, 0, &rates, &basic_rates, NULL, &have_higher_than_11mbit, &min_rate, &min_rate_index); /* * This used to be a workaround for basic rates missing * in the association response frame. Now that we no * longer use the basic rates from there, it probably * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. * Allow this workaround only in case the AP provided at least * one rate. */ if (min_rate_index < 0) { link_info(link, "No legacy rates in association response\n"); return -EINVAL; } else if (!basic_rates) { link_info(link, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); } if (rates) link_sta->pub->supp_rates[cbss->channel->band] = rates; else link_info(link, "No rates found, keeping mandatory only\n"); link->conf->basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ link->operating_11g_mode = sband->band == NL80211_BAND_2GHZ && have_higher_than_11mbit; return 0; } static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, struct cfg80211_bss *cbss) { struct ieee80211_he_mcs_nss_supp *he_mcs_nss_supp; const struct element *ht_cap_elem, *vht_cap_elem; const struct cfg80211_bss_ies *ies; const struct ieee80211_ht_cap *ht_cap; const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; const struct element *he_cap_elem; u16 mcs_80_map, mcs_160_map; int i, mcs_nss_size; bool support_160; u8 chains = 1; if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HT) return chains; ht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_CAPABILITY); if (ht_cap_elem && ht_cap_elem->datalen >= sizeof(*ht_cap)) { ht_cap = (void *)ht_cap_elem->data; chains = ieee80211_mcs_to_chains(&ht_cap->mcs); /* * TODO: use "Tx Maximum Number Spatial Streams Supported" and * "Tx Unequal Modulation Supported" fields. */ } if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_VHT) return chains; vht_cap_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); if (vht_cap_elem && vht_cap_elem->datalen >= sizeof(*vht_cap)) { u8 nss; u16 tx_mcs_map; vht_cap = (void *)vht_cap_elem->data; tx_mcs_map = le16_to_cpu(vht_cap->supp_mcs.tx_mcs_map); for (nss = 8; nss > 0; nss--) { if (((tx_mcs_map >> (2 * (nss - 1))) & 3) != IEEE80211_VHT_MCS_NOT_SUPPORTED) break; } /* TODO: use "Tx Highest Supported Long GI Data Rate" field? */ chains = max(chains, nss); } if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_HE) return chains; ies = rcu_dereference(cbss->ies); he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap)) return chains; /* skip one byte ext_tag_id */ he_cap = (void *)(he_cap_elem->data + 1); mcs_nss_size = ieee80211_he_mcs_nss_size(he_cap); /* invalid HE IE */ if (he_cap_elem->datalen < 1 + mcs_nss_size + sizeof(*he_cap)) return chains; /* mcs_nss is right after he_cap info */ he_mcs_nss_supp = (void *)(he_cap + 1); mcs_80_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_80 = mcs_80_map >> (2 * i) & 3; if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { chains = max_t(u8, chains, i + 1); break; } } support_160 = he_cap->phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (!support_160) return chains; mcs_160_map = le16_to_cpu(he_mcs_nss_supp->tx_mcs_160); for (i = 7; i >= 0; i--) { u8 mcs_160 = mcs_160_map >> (2 * i) & 3; if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { chains = max_t(u8, chains, i + 1); break; } } return chains; } static void ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_assoc_request *req, bool wmm_used, int link_id, struct ieee80211_conn_settings *conn) { struct ieee80211_sta_ht_cap sta_ht_cap = sband->ht_cap; bool is_5ghz = sband->band == NL80211_BAND_5GHZ; bool is_6ghz = sband->band == NL80211_BAND_6GHZ; const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; struct ieee80211_sta_vht_cap vht_cap; if (sband->band == NL80211_BAND_S1GHZ) { conn->mode = IEEE80211_CONN_MODE_S1G; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_dbg(sdata, "operating as S1G STA\n"); return; } conn->mode = IEEE80211_CONN_MODE_LEGACY; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); if (req && req->flags & ASSOC_REQ_DISABLE_HT) { mlme_link_id_dbg(sdata, link_id, "HT disabled by flag, limiting to legacy\n"); goto out; } if (!wmm_used) { mlme_link_id_dbg(sdata, link_id, "WMM/QoS not supported, limiting to legacy\n"); goto out; } if (req) { unsigned int i; for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) { if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP104) { netdev_info(sdata->dev, "WEP/TKIP use, limiting to legacy\n"); goto out; } } } if (!sta_ht_cap.ht_supported && !is_6ghz) { mlme_link_id_dbg(sdata, link_id, "HT not supported (and not on 6 GHz), limiting to legacy\n"); goto out; } /* HT is fine */ conn->mode = IEEE80211_CONN_MODE_HT; conn->bw_limit = sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_CONN_BW_LIMIT_40 : IEEE80211_CONN_BW_LIMIT_20; memcpy(&vht_cap, &sband->vht_cap, sizeof(vht_cap)); ieee80211_apply_vhtcap_overrides(sdata, &vht_cap); if (req && req->flags & ASSOC_REQ_DISABLE_VHT) { mlme_link_id_dbg(sdata, link_id, "VHT disabled by flag, limiting to HT\n"); goto out; } if (vht_cap.vht_supported && is_5ghz) { bool have_80mhz = false; unsigned int i; if (conn->bw_limit == IEEE80211_CONN_BW_LIMIT_20) { mlme_link_id_dbg(sdata, link_id, "no 40 MHz support on 5 GHz, limiting to HT\n"); goto out; } /* Allow VHT if at least one channel on the sband supports 80 MHz */ for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) { mlme_link_id_dbg(sdata, link_id, "no 80 MHz channel support on 5 GHz, limiting to HT\n"); goto out; } } else if (is_5ghz) { /* !vht_supported but on 5 GHz */ mlme_link_id_dbg(sdata, link_id, "no VHT support on 5 GHz, limiting to HT\n"); goto out; } /* VHT - if we have - is fine, including 80 MHz, check 160 below again */ if (sband->band != NL80211_BAND_2GHZ) { conn->mode = IEEE80211_CONN_MODE_VHT; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_160; } if (is_5ghz && !(vht_cap.cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ))) { conn->bw_limit = IEEE80211_CONN_BW_LIMIT_80; mlme_link_id_dbg(sdata, link_id, "no VHT 160 MHz capability on 5 GHz, limiting to 80 MHz"); } if (req && req->flags & ASSOC_REQ_DISABLE_HE) { mlme_link_id_dbg(sdata, link_id, "HE disabled by flag, limiting to HT/VHT\n"); goto out; } he_cap = ieee80211_get_he_iftype_cap_vif(sband, &sdata->vif); if (!he_cap) { WARN_ON(is_6ghz); mlme_link_id_dbg(sdata, link_id, "no HE support, limiting to HT/VHT\n"); goto out; } /* so we have HE */ conn->mode = IEEE80211_CONN_MODE_HE; /* check bandwidth */ switch (sband->band) { default: case NL80211_BAND_2GHZ: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) break; conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_link_id_dbg(sdata, link_id, "no 40 MHz HE cap in 2.4 GHz, limiting to 20 MHz\n"); break; case NL80211_BAND_5GHZ: if (!(he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)) { conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; mlme_link_id_dbg(sdata, link_id, "no 40/80 MHz HE cap in 5 GHz, limiting to 20 MHz\n"); break; } if (!(he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G)) { conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_80); mlme_link_id_dbg(sdata, link_id, "no 160 MHz HE cap in 5 GHz, limiting to 80 MHz\n"); } break; case NL80211_BAND_6GHZ: if (he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) break; conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_80); mlme_link_id_dbg(sdata, link_id, "no 160 MHz HE cap in 6 GHz, limiting to 80 MHz\n"); break; } if (req && req->flags & ASSOC_REQ_DISABLE_EHT) { mlme_link_id_dbg(sdata, link_id, "EHT disabled by flag, limiting to HE\n"); goto out; } eht_cap = ieee80211_get_eht_iftype_cap_vif(sband, &sdata->vif); if (!eht_cap) { mlme_link_id_dbg(sdata, link_id, "no EHT support, limiting to HE\n"); goto out; } /* we have EHT */ conn->mode = IEEE80211_CONN_MODE_EHT; /* check bandwidth */ if (is_6ghz && eht_cap->eht_cap_elem.phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) conn->bw_limit = IEEE80211_CONN_BW_LIMIT_320; else if (is_6ghz) mlme_link_id_dbg(sdata, link_id, "no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n"); out: mlme_link_id_dbg(sdata, link_id, "determined local STA to be %s, BW limited to %d MHz\n", ieee80211_conn_mode_str(conn->mode), 20 * (1 << conn->bw_limit)); } static void ieee80211_determine_our_sta_mode_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_auth_request *req, bool wmm_used, struct ieee80211_conn_settings *conn) { ieee80211_determine_our_sta_mode(sdata, sband, NULL, wmm_used, req->link_id > 0 ? req->link_id : 0, conn); } static void ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct cfg80211_assoc_request *req, bool wmm_used, int link_id, struct ieee80211_conn_settings *conn) { struct ieee80211_conn_settings tmp; WARN_ON(!req); ieee80211_determine_our_sta_mode(sdata, sband, req, wmm_used, link_id, &tmp); conn->mode = min_t(enum ieee80211_conn_mode, conn->mode, tmp.mode); conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, tmp.bw_limit); } static enum ieee80211_ap_reg_power ieee80211_ap_power_type(u8 control) { switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; default: return IEEE80211_REG_UNSET_AP; } } static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, struct cfg80211_bss *cbss, bool mlo, struct ieee80211_conn_settings *conn, unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; struct ieee802_11_elems *elems; int ret; lockdep_assert_wiphy(local->hw.wiphy); rcu_read_lock(); elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id, &chanreq, &ap_chandef, userspace_selectors); if (IS_ERR(elems)) { rcu_read_unlock(); return PTR_ERR(elems); } if (mlo && !elems->ml_basic) { sdata_info(sdata, "Rejecting MLO as it is not supported by AP\n"); rcu_read_unlock(); kfree(elems); return -EINVAL; } if (link && is_6ghz && conn->mode >= IEEE80211_CONN_MODE_HE) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; if (elems->pwr_constr_elem) link->conf->pwr_reduction = *elems->pwr_constr_elem; he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); if (he_6ghz_oper) link->conf->power_type = ieee80211_ap_power_type(he_6ghz_oper->control); else link_info(link, "HE 6 GHz operation missing (on %d MHz), expect issues\n", cbss->channel->center_freq); link->conf->tpe = elems->tpe; ieee80211_rearrange_tpe(&link->conf->tpe, &ap_chandef, &chanreq.oper); } rcu_read_unlock(); /* the element data was RCU protected so no longer valid anyway */ kfree(elems); elems = NULL; if (!link) return 0; rcu_read_lock(); link->needed_rx_chains = min(ieee80211_max_rx_chains(link, cbss), local->rx_chains); rcu_read_unlock(); /* * If this fails (possibly due to channel context sharing * on incompatible channels, e.g. 80+80 and 160 sharing the * same control channel) try to use a smaller bandwidth. */ ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); /* don't downgrade for 5 and 10 MHz channels, though. */ if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || chanreq.oper.width == NL80211_CHAN_WIDTH_10) return ret; while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) { ieee80211_chanreq_downgrade(&chanreq, conn); ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); } return ret; } static bool ieee80211_get_dtim(const struct cfg80211_bss_ies *ies, u8 *dtim_count, u8 *dtim_period) { const u8 *tim_ie = cfg80211_find_ie(WLAN_EID_TIM, ies->data, ies->len); const u8 *idx_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, ies->data, ies->len); const struct ieee80211_tim_ie *tim = NULL; const struct ieee80211_bssid_index *idx; bool valid = tim_ie && tim_ie[1] >= 2; if (valid) tim = (void *)(tim_ie + 2); if (dtim_count) *dtim_count = valid ? tim->dtim_count : 0; if (dtim_period) *dtim_period = valid ? tim->dtim_period : 0; /* Check if value is overridden by non-transmitted profile */ if (!idx_ie || idx_ie[1] < 3) return valid; idx = (void *)(idx_ie + 2); if (dtim_count) *dtim_count = idx->dtim_count; if (dtim_period) *dtim_period = idx->dtim_period; return true; } static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, struct ieee802_11_elems *elems, const u8 *elem_start, unsigned int elem_len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_local *local = sdata->local; unsigned int link_id; struct sta_info *sta; u64 changed[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u16 valid_links = 0, dormant_links = 0; int err; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * station info was already allocated and inserted before * the association and should be available to us */ sta = sta_info_get(sdata, assoc_data->ap_addr); if (WARN_ON(!sta)) goto out_err; sta->sta.spp_amsdu = assoc_data->spp_amsdu; if (ieee80211_vif_is_mld(&sdata->vif)) { for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!assoc_data->link[link_id].bss) continue; valid_links |= BIT(link_id); if (assoc_data->link[link_id].disabled) dormant_links |= BIT(link_id); if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_allocate_link(sta, link_id); if (err) goto out_err; } } ieee80211_vif_set_links(sdata, valid_links, dormant_links); } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) goto out_err; if (ieee80211_vif_is_mld(&sdata->vif)) link_info(link, "local address %pM, AP link address %pM%s\n", link->conf->addr, assoc_data->link[link_id].bss->bssid, link_id == assoc_data->assoc_link_id ? " (assoc)" : ""); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) goto out_err; if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->beacon_ies); if (ies) link->u.mgd.have_beacon = true; else ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); } link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; if (link_id != assoc_data->assoc_link_id) { link->u.mgd.conn = assoc_data->link[link_id].conn; err = ieee80211_prep_channel(sdata, link, link_id, cbss, true, &link->u.mgd.conn, sdata->u.mgd.userspace_selectors); if (err) { link_info(link, "prep_channel failed\n"); goto out_err; } } err = ieee80211_mgd_setup_link_sta(link, sta, link_sta, assoc_data->link[link_id].bss); if (err) goto out_err; if (!ieee80211_assoc_config_link(link, link_sta, assoc_data->link[link_id].bss, mgmt, elem_start, elem_len, &changed[link_id])) goto out_err; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { valid_links &= ~BIT(link_id); ieee80211_sta_remove_link(sta, link_id); continue; } if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_activate_link(sta, link_id); if (err) goto out_err; } } /* links might have changed due to rejected ones, set them again */ ieee80211_vif_set_links(sdata, valid_links, dormant_links); rate_control_rate_init_all_links(sta); if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { set_sta_flag(sta, WLAN_STA_MFP); sta->sta.mfp = true; } else { sta->sta.mfp = false; } ieee80211_sta_set_max_amsdu_subframes(sta, elems->ext_capab, elems->ext_capab_len); sta->sta.wme = (elems->wmm_param || elems->s1g_capab) && local->hw.queues >= IEEE80211_NUM_ACS; err = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (!err && !(ifmgd->flags & IEEE80211_STA_CONTROL_PORT)) err = sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); if (err) { sdata_info(sdata, "failed to move station %pM to desired state\n", sta->sta.addr); WARN_ON(__sta_info_destroy(sta)); goto out_err; } if (sdata->wdev.use_4addr) drv_sta_set_4addr(local, sdata, &sta->sta, true); ieee80211_set_associated(sdata, assoc_data, changed); /* * If we're using 4-addr mode, let the AP know that we're * doing so, so that it can create the STA VLAN on its side */ if (ifmgd->use_4addr) ieee80211_send_4addr_nullfunc(local, sdata); /* * Start timer to probe the connection to the AP now. * Also start the timer that will detect beacon loss. */ ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); return true; out_err: eth_zero_addr(sdata->vif.cfg.ap_addr); return false; } static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; struct ieee80211_elems_parse_params parse_params = { .bss = NULL, .link_id = -1, .from_ap = true, }; struct ieee802_11_elems *elems; int ac; const u8 *elem_start; unsigned int elem_len; bool reassoc; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, }; struct ieee80211_prep_tx_info info = {}; struct cfg80211_rx_assoc_resp_data resp = { .uapsd_queues = -1, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!assoc_data) return; info.link_id = assoc_data->assoc_link_id; parse_params.mode = assoc_data->link[assoc_data->assoc_link_id].conn.mode; if (!ether_addr_equal(assoc_data->ap_addr, mgmt->bssid) || !ether_addr_equal(assoc_data->ap_addr, mgmt->sa)) return; /* * AssocResp and ReassocResp have identical structure, so process both * of them in this function. */ if (len < 24 + 6) return; reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); if (assoc_data->s1g) elem_start = mgmt->u.s1g_assoc_resp.variable; else elem_start = mgmt->u.assoc_resp.variable; /* * Note: this may not be perfect, AP might misbehave - if * anyone needs to rely on perfect complete notification * with the exact right subtype, then we need to track what * we actually transmitted. */ info.subtype = reassoc ? IEEE80211_STYPE_REASSOC_REQ : IEEE80211_STYPE_ASSOC_REQ; if (assoc_data->fils_kek_len && fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0) return; elem_len = len - (elem_start - (u8 *)mgmt); parse_params.start = elem_start; parse_params.len = elem_len; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) goto notify_driver; if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); else if (assoc_data->s1g) aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* * The 5 MSB of the AID field are reserved * (802.11-2016 9.4.1.8 AID field) */ aid &= 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", reassoc ? "Rea" : "A", assoc_data->ap_addr, capab_info, status_code, (u16)(aid & ~(BIT(15) | BIT(14)))); ifmgd->broken_ap = false; if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && elems->timeout_int && elems->timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) { u32 tu, ms; cfg80211_assoc_comeback(sdata->dev, assoc_data->ap_addr, le32_to_cpu(elems->timeout_int->value)); tu = le32_to_cpu(elems->timeout_int->value); ms = tu * 1024 / 1000; sdata_info(sdata, "%pM rejected association temporarily; comeback duration %u TU (%u ms)\n", assoc_data->ap_addr, tu, ms); assoc_data->timeout = jiffies + msecs_to_jiffies(ms); assoc_data->timeout_started = true; assoc_data->comeback = true; if (ms > IEEE80211_ASSOC_TIMEOUT) run_again(sdata, assoc_data->timeout); goto notify_driver; } if (status_code != WLAN_STATUS_SUCCESS) { sdata_info(sdata, "%pM denied association (code=%d)\n", assoc_data->ap_addr, status_code); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { if (aid == 0 || aid > IEEE80211_MAX_AID) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); aid = 0; ifmgd->broken_ap = true; } if (ieee80211_vif_is_mld(&sdata->vif)) { struct ieee80211_mle_basic_common_info *common; if (!elems->ml_basic) { sdata_info(sdata, "MLO association with %pM but no (basic) multi-link element in response!\n", assoc_data->ap_addr); goto abandon_assoc; } common = (void *)elems->ml_basic->variable; if (memcmp(assoc_data->ap_addr, common->mld_mac_addr, ETH_ALEN)) { sdata_info(sdata, "AP MLD MAC address mismatch: got %pM expected %pM\n", common->mld_mac_addr, assoc_data->ap_addr); goto abandon_assoc; } sdata->vif.cfg.eml_cap = ieee80211_mle_get_eml_cap((const void *)elems->ml_basic); sdata->vif.cfg.eml_med_sync_delay = ieee80211_mle_get_eml_med_sync_delay((const void *)elems->ml_basic); sdata->vif.cfg.mld_capa_op = ieee80211_mle_get_mld_capa_op((const void *)elems->ml_basic); } sdata->vif.cfg.aid = aid; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); goto notify_driver; } event.u.mlme.status = MLME_SUCCESS; drv_event_callback(sdata->local, sdata, &event); sdata_info(sdata, "associated\n"); info.success = 1; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_link_data *link; if (!assoc_data->link[link_id].bss) continue; resp.links[link_id].bss = assoc_data->link[link_id].bss; ether_addr_copy(resp.links[link_id].addr, assoc_data->link[link_id].addr); resp.links[link_id].status = assoc_data->link[link_id].status; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) if (link->tx_conf[ac].uapsd) resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; } if (ieee80211_vif_is_mld(&sdata->vif)) { ether_addr_copy(ap_mld_addr, sdata->vif.cfg.ap_addr); resp.ap_mld_addr = ap_mld_addr; } ieee80211_destroy_assoc_data(sdata, status_code == WLAN_STATUS_SUCCESS ? ASSOC_SUCCESS : ASSOC_REJECTED); resp.buf = (u8 *)mgmt; resp.len = len; resp.req_ies = ifmgd->assoc_req_ies; resp.req_ies_len = ifmgd->assoc_req_ies_len; cfg80211_rx_assoc_resp(sdata->dev, &resp); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); kfree(elems); return; abandon_assoc: ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); goto notify_driver; } static void ieee80211_rx_bss_info(struct ieee80211_link_data *link, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_bss *bss; struct ieee80211_channel *channel; lockdep_assert_wiphy(sdata->local->hw.wiphy); channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, channel); if (bss) { link->conf->beacon_rate = bss->beacon_rate; ieee80211_rx_bss_put(local, bss); } } static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_link_data *link, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_if_managed *ifmgd; struct ieee80211_rx_status *rx_status = (void *) skb->cb; struct ieee80211_channel *channel; size_t baselen, len = skb->len; ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * According to Draft P802.11ax D6.0 clause 26.17.2.3.2: * "If a 6 GHz AP receives a Probe Request frame and responds with * a Probe Response frame [..], the Address 1 field of the Probe * Response frame shall be set to the broadcast address [..]" * So, on 6GHz band we should also accept broadcast responses. */ channel = ieee80211_get_channel(sdata->local->hw.wiphy, rx_status->freq); if (!channel) return; if (!ether_addr_equal(mgmt->da, sdata->vif.addr) && (channel->band != NL80211_BAND_6GHZ || !is_broadcast_ether_addr(mgmt->da))) return; /* ignore ProbeResp to foreign address */ baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt; if (baselen > len) return; ieee80211_rx_bss_info(link, mgmt, len, rx_status); if (ifmgd->associated && ether_addr_equal(mgmt->bssid, link->u.mgd.bssid)) ieee80211_reset_ap_probe(sdata); } /* * This is the canonical list of information elements we care about, * the filter code also gives us all changes to the Microsoft OUI * (00:50:F2) vendor IE which is used for WMM which we need to track, * as well as the DTPC IE (part of the Cisco OUI) used for signaling * changes to requested client power. * * We implement beacon filtering in software since that means we can * avoid processing the frame here and in cfg80211, and userspace * will not be able to tell whether the hardware supports it or not. * * XXX: This list needs to be dynamic -- userspace needs to be able to * add items it requires. It also needs to be able to tell us to * look out for other vendor IEs. */ static const u64 care_about_ies = (1ULL << WLAN_EID_COUNTRY) | (1ULL << WLAN_EID_ERP_INFO) | (1ULL << WLAN_EID_CHANNEL_SWITCH) | (1ULL << WLAN_EID_PWR_CONSTRAINT) | (1ULL << WLAN_EID_HT_CAPABILITY) | (1ULL << WLAN_EID_HT_OPERATION) | (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); static void ieee80211_handle_beacon_sig(struct ieee80211_link_data *link, struct ieee80211_if_managed *ifmgd, struct ieee80211_bss_conf *bss_conf, struct ieee80211_local *local, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; /* Track average RSSI from the Beacon frames of the current AP */ if (!link->u.mgd.tracking_signal_avg) { link->u.mgd.tracking_signal_avg = true; ewma_beacon_signal_init(&link->u.mgd.ave_beacon_signal); link->u.mgd.last_cqm_event_signal = 0; link->u.mgd.count_beacon_signal = 1; link->u.mgd.last_ave_beacon_signal = 0; } else { link->u.mgd.count_beacon_signal++; } ewma_beacon_signal_add(&link->u.mgd.ave_beacon_signal, -rx_status->signal); if (ifmgd->rssi_min_thold != ifmgd->rssi_max_thold && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_sig = link->u.mgd.last_ave_beacon_signal; struct ieee80211_event event = { .type = RSSI_EVENT, }; /* * if signal crosses either of the boundaries, invoke callback * with appropriate parameters */ if (sig > ifmgd->rssi_max_thold && (last_sig <= ifmgd->rssi_min_thold || last_sig == 0)) { link->u.mgd.last_ave_beacon_signal = sig; event.u.rssi.data = RSSI_EVENT_HIGH; drv_event_callback(local, sdata, &event); } else if (sig < ifmgd->rssi_min_thold && (last_sig >= ifmgd->rssi_max_thold || last_sig == 0)) { link->u.mgd.last_ave_beacon_signal = sig; event.u.rssi.data = RSSI_EVENT_LOW; drv_event_callback(local, sdata, &event); } } if (bss_conf->cqm_rssi_thold && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_event = link->u.mgd.last_cqm_event_signal; int thold = bss_conf->cqm_rssi_thold; int hyst = bss_conf->cqm_rssi_hyst; if (sig < thold && (last_event == 0 || sig < last_event - hyst)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, sig, GFP_KERNEL); } else if (sig > thold && (last_event == 0 || sig > last_event + hyst)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, sig, GFP_KERNEL); } } if (bss_conf->cqm_rssi_low && link->u.mgd.count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { int sig = -ewma_beacon_signal_read(&link->u.mgd.ave_beacon_signal); int last_event = link->u.mgd.last_cqm_event_signal; int low = bss_conf->cqm_rssi_low; int high = bss_conf->cqm_rssi_high; if (sig < low && (last_event == 0 || last_event >= low)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, sig, GFP_KERNEL); } else if (sig > high && (last_event == 0 || last_event <= high)) { link->u.mgd.last_cqm_event_signal = sig; ieee80211_cqm_rssi_notify( &sdata->vif, NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, sig, GFP_KERNEL); } } } static bool ieee80211_rx_our_beacon(const u8 *tx_bssid, struct cfg80211_bss *bss) { if (ether_addr_equal(tx_bssid, bss->bssid)) return true; if (!bss->transmitted_bss) return false; return ether_addr_equal(tx_bssid, bss->transmitted_bss->bssid); } static void ieee80211_ml_reconf_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ml_reconf_work.work); u16 new_valid_links, new_active_links, new_dormant_links; int ret; if (!sdata->u.mgd.removed_links) return; sdata_info(sdata, "MLO Reconfiguration: work: valid=0x%x, removed=0x%x\n", sdata->vif.valid_links, sdata->u.mgd.removed_links); new_valid_links = sdata->vif.valid_links & ~sdata->u.mgd.removed_links; if (new_valid_links == sdata->vif.valid_links) return; if (!new_valid_links || !(new_valid_links & ~sdata->vif.dormant_links)) { sdata_info(sdata, "No valid links after reconfiguration\n"); ret = -EINVAL; goto out; } new_active_links = sdata->vif.active_links & ~sdata->u.mgd.removed_links; if (new_active_links != sdata->vif.active_links) { if (!new_active_links) new_active_links = BIT(ffs(new_valid_links & ~sdata->vif.dormant_links) - 1); ret = ieee80211_set_active_links(&sdata->vif, new_active_links); if (ret) { sdata_info(sdata, "Failed setting active links\n"); goto out; } } new_dormant_links = sdata->vif.dormant_links & ~sdata->u.mgd.removed_links; ret = ieee80211_vif_set_links(sdata, new_valid_links, new_dormant_links); if (ret) sdata_info(sdata, "Failed setting valid links\n"); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); out: if (!ret) cfg80211_links_removed(sdata->dev, sdata->u.mgd.removed_links); else __ieee80211_disconnect(sdata); sdata->u.mgd.removed_links = 0; } static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { const struct element *sub; unsigned long removed_links = 0; u16 link_removal_timeout[IEEE80211_MLD_MAX_NUM_LINKS] = {}; u8 link_id; u32 delay; if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_reconf) return; /* Directly parse the sub elements as the common information doesn't * hold any useful information. */ for_each_mle_subelement(sub, (const u8 *)elems->ml_reconf, elems->ml_reconf_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; u8 *pos = prof->variable; u16 control; if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) continue; if (!ieee80211_mle_reconf_sta_prof_size_ok(sub->data, sub->datalen)) return; control = le16_to_cpu(prof->control); link_id = control & IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID; removed_links |= BIT(link_id); /* the MAC address should not be included, but handle it */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) pos += 6; /* According to Draft P802.11be_D3.0, the control should * include the AP Removal Timer present. If the AP Removal Timer * is not present assume immediate removal. */ if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) link_removal_timeout[link_id] = get_unaligned_le16(pos); } removed_links &= sdata->vif.valid_links; if (!removed_links) { /* In case the removal was cancelled, abort it */ if (sdata->u.mgd.removed_links) { sdata->u.mgd.removed_links = 0; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); } return; } delay = 0; for_each_set_bit(link_id, &removed_links, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *link_conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); u32 link_delay; if (!link_conf) { removed_links &= ~BIT(link_id); continue; } if (link_removal_timeout[link_id] < 1) link_delay = 0; else link_delay = link_conf->beacon_int * (link_removal_timeout[link_id] - 1); if (!delay) delay = link_delay; else delay = min(delay, link_delay); } sdata->u.mgd.removed_links = removed_links; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work, TU_TO_JIFFIES(delay)); } static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, u16 active_links, u16 dormant_links, u16 suspended_links) { u64 changed = 0; int ret; if (!active_links) { ret = -EINVAL; goto out; } /* If there is an active negotiated TTLM, it should be discarded by * the new negotiated/advertised TTLM. */ if (sdata->vif.neg_ttlm.valid) { memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); sdata->vif.suspended_links = 0; changed = BSS_CHANGED_MLD_TTLM; } if (sdata->vif.active_links != active_links) { /* usable links are affected when active_links are changed, * so notify the driver about the status change */ changed |= BSS_CHANGED_MLD_VALID_LINKS; active_links &= sdata->vif.active_links; if (!active_links) active_links = BIT(__ffs(sdata->vif.valid_links & ~dormant_links)); ret = ieee80211_set_active_links(&sdata->vif, active_links); if (ret) { sdata_info(sdata, "Failed to set TTLM active links\n"); goto out; } } ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, dormant_links); if (ret) { sdata_info(sdata, "Failed to set TTLM dormant links\n"); goto out; } sdata->vif.suspended_links = suspended_links; if (sdata->vif.suspended_links) changed |= BSS_CHANGED_MLD_TTLM; ieee80211_vif_cfg_change_notify(sdata, changed); out: if (ret) ieee80211_disconnect(&sdata->vif, false); return ret; } static void ieee80211_tid_to_link_map_work(struct wiphy *wiphy, struct wiphy_work *work) { u16 new_active_links, new_dormant_links; struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.ttlm_work.work); new_active_links = sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; new_dormant_links = ~sdata->u.mgd.ttlm_info.map & sdata->vif.valid_links; ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, 0)) return; sdata->u.mgd.ttlm_info.active = true; sdata->u.mgd.ttlm_info.switch_time = 0; } static u16 ieee80211_get_ttlm(u8 bm_size, u8 *data) { if (bm_size == 1) return *data; else return get_unaligned_le16(data); } static int ieee80211_parse_adv_t2l(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ttlm_elem *ttlm, struct ieee80211_adv_ttlm_info *ttlm_info) { /* The element size was already validated in * ieee80211_tid_to_link_map_size_ok() */ u8 control, link_map_presence, map_size, tid; u8 *pos; memset(ttlm_info, 0, sizeof(*ttlm_info)); pos = (void *)ttlm->optional; control = ttlm->control; if ((control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) || !(control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT)) return 0; if ((control & IEEE80211_TTLM_CONTROL_DIRECTION) != IEEE80211_TTLM_DIRECTION_BOTH) { sdata_info(sdata, "Invalid advertised T2L map direction\n"); return -EINVAL; } link_map_presence = *pos; pos++; ttlm_info->switch_time = get_unaligned_le16(pos); /* Since ttlm_info->switch_time == 0 means no switch time, bump it * by 1. */ if (!ttlm_info->switch_time) ttlm_info->switch_time = 1; pos += 2; if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) { ttlm_info->duration = pos[0] | pos[1] << 8 | pos[2] << 16; pos += 3; } if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) map_size = 1; else map_size = 2; /* According to Draft P802.11be_D3.0 clause 35.3.7.1.7, an AP MLD shall * not advertise a TID-to-link mapping that does not map all TIDs to the * same link set, reject frame if not all links have mapping */ if (link_map_presence != 0xff) { sdata_info(sdata, "Invalid advertised T2L mapping presence indicator\n"); return -EINVAL; } ttlm_info->map = ieee80211_get_ttlm(map_size, pos); if (!ttlm_info->map) { sdata_info(sdata, "Invalid advertised T2L map for TID 0\n"); return -EINVAL; } pos += map_size; for (tid = 1; tid < 8; tid++) { u16 map = ieee80211_get_ttlm(map_size, pos); if (map != ttlm_info->map) { sdata_info(sdata, "Invalid advertised T2L map for tid %d\n", tid); return -EINVAL; } pos += map_size; } return 0; } static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, u64 beacon_ts) { u8 i; int ret; if (!ieee80211_vif_is_mld(&sdata->vif)) return; if (!elems->ttlm_num) { if (sdata->u.mgd.ttlm_info.switch_time) { /* if a planned TID-to-link mapping was cancelled - * abort it */ wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); } else if (sdata->u.mgd.ttlm_info.active) { /* if no TID-to-link element, set to default mapping in * which all TIDs are mapped to all setup links */ ret = ieee80211_vif_set_links(sdata, sdata->vif.valid_links, 0); if (ret) { sdata_info(sdata, "Failed setting valid/dormant links\n"); return; } ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); } memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); return; } for (i = 0; i < elems->ttlm_num; i++) { struct ieee80211_adv_ttlm_info ttlm_info; u32 res; res = ieee80211_parse_adv_t2l(sdata, elems->ttlm[i], &ttlm_info); if (res) { __ieee80211_disconnect(sdata); return; } if (ttlm_info.switch_time) { u16 beacon_ts_tu, st_tu, delay; u32 delay_jiffies; u64 mask; /* The t2l map switch time is indicated with a partial * TSF value (bits 10 to 25), get the partial beacon TS * as well, and calc the delay to the start time. */ mask = GENMASK_ULL(25, 10); beacon_ts_tu = (beacon_ts & mask) >> 10; st_tu = ttlm_info.switch_time; delay = st_tu - beacon_ts_tu; /* * If the switch time is far in the future, then it * could also be the previous switch still being * announced. * We can simply ignore it for now, if it is a future * switch the AP will continue to announce it anyway. */ if (delay > IEEE80211_ADV_TTLM_ST_UNDERFLOW) return; delay_jiffies = TU_TO_JIFFIES(delay); /* Link switching can take time, so schedule it * 100ms before to be ready on time */ if (delay_jiffies > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) delay_jiffies -= IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS; else delay_jiffies = 0; sdata->u.mgd.ttlm_info = ttlm_info; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work, delay_jiffies); return; } } } static void ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, int reporting_link_id, struct ieee802_11_elems *elems) { const struct element *sta_profiles[IEEE80211_MLD_MAX_NUM_LINKS] = {}; ssize_t sta_profiles_len[IEEE80211_MLD_MAX_NUM_LINKS] = {}; const struct element *sub; const u8 *subelems; size_t subelems_len; u8 common_size; int link_id; if (!ieee80211_mle_size_ok((u8 *)elems->ml_basic, elems->ml_basic_len)) return; common_size = ieee80211_mle_common_size((u8 *)elems->ml_basic); subelems = (u8 *)elems->ml_basic + common_size; subelems_len = elems->ml_basic_len - common_size; for_each_element_id(sub, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE, subelems, subelems_len) { struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; struct ieee80211_link_data *link; ssize_t len; if (!ieee80211_mle_basic_sta_prof_size_ok(sub->data, sub->datalen)) continue; link_id = le16_get_bits(prof->control, IEEE80211_MLE_STA_CONTROL_LINK_ID); /* need a valid link ID, but also not our own, both AP bugs */ if (link_id == reporting_link_id || link_id >= IEEE80211_MLD_MAX_NUM_LINKS) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; len = cfg80211_defragment_element(sub, subelems, subelems_len, NULL, 0, IEEE80211_MLE_SUBELEM_FRAGMENT); if (WARN_ON(len < 0)) continue; sta_profiles[link_id] = sub; sta_profiles_len[link_id] = len; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_mle_per_sta_profile *prof; struct ieee802_11_elems *prof_elems; struct ieee80211_link_data *link; ssize_t len; if (link_id == reporting_link_id) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; if (!sta_profiles[link_id]) { prof_elems = NULL; goto handle; } /* we can defragment in-place, won't use the buffer again */ len = cfg80211_defragment_element(sta_profiles[link_id], subelems, subelems_len, (void *)sta_profiles[link_id], sta_profiles_len[link_id], IEEE80211_MLE_SUBELEM_FRAGMENT); if (WARN_ON(len != sta_profiles_len[link_id])) continue; prof = (void *)sta_profiles[link_id]; prof_elems = ieee802_11_parse_elems(prof->variable + (prof->sta_info_len - 1), len - (prof->sta_info_len - 1), false, NULL); /* memory allocation failed - let's hope that's transient */ if (!prof_elems) continue; handle: /* * FIXME: the timings here are obviously incorrect, * but only older Intel drivers seem to care, and * those don't have MLO. If you really need this, * the problem is having to calculate it with the * TSF offset etc. The device_timestamp is still * correct, of course. */ ieee80211_sta_process_chanswitch(link, 0, 0, elems, prof_elems, IEEE80211_CSA_SOURCE_OTHER_LINK); kfree(prof_elems); } } static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, const struct ieee802_11_elems *elems) { struct ieee80211_vif_cfg *cfg = &sdata->vif.cfg; static u8 zero_ssid[IEEE80211_MAX_SSID_LEN]; if (!elems->ssid) return false; /* hidden SSID: zero length */ if (elems->ssid_len == 0) return false; if (elems->ssid_len != cfg->ssid_len) return true; /* hidden SSID: zeroed out */ if (!memcmp(elems->ssid, zero_ssid, elems->ssid_len)) return false; return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_mgmt *mgmt = (void *) hdr; size_t baselen; struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; struct link_sta_info *link_sta; struct sta_info *sta; u64 changed = 0; bool erp_valid; u8 erp_value = 0; u32 ncrc = 0; u8 *bssid, *variable = mgmt->u.beacon.variable; u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; struct ieee80211_elems_parse_params parse_params = { .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, }; lockdep_assert_wiphy(local->hw.wiphy); /* Process beacon from the current BSS */ bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { struct ieee80211_ext *ext = (void *) mgmt; variable = ext->u.s1g_beacon.variable + ieee80211_s1g_optional_len(ext->frame_control); } baselen = (u8 *) variable - (u8 *) mgmt; if (baselen > len) return; parse_params.start = variable; parse_params.len = len - baselen; rcu_read_lock(); chanctx_conf = rcu_dereference(bss_conf->chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return; } if (ieee80211_rx_status_to_khz(rx_status) != ieee80211_channel_to_khz(chanctx_conf->def.chan)) { rcu_read_unlock(); return; } chan = chanctx_conf->def.chan; rcu_read_unlock(); if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && !WARN_ON(ieee80211_vif_is_mld(&sdata->vif)) && ieee80211_rx_our_beacon(bssid, ifmgd->assoc_data->link[0].bss)) { parse_params.bss = ifmgd->assoc_data->link[0].bss; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return; ieee80211_rx_bss_info(link, mgmt, len, rx_status); if (elems->dtim_period) link->u.mgd.dtim_period = elems->dtim_period; link->u.mgd.have_beacon = true; ifmgd->assoc_data->need_beacon = false; if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && !ieee80211_is_s1g_beacon(hdr->frame_control)) { bss_conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); bss_conf->sync_device_ts = rx_status->device_timestamp; bss_conf->sync_dtim_count = elems->dtim_count; } if (elems->mbssid_config_ie) bss_conf->profile_periodicity = elems->mbssid_config_ie->profile_periodicity; else bss_conf->profile_periodicity = 0; if (elems->ext_capab_len >= 11 && (elems->ext_capab[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) bss_conf->ema_ap = true; else bss_conf->ema_ap = false; /* continue assoc process */ ifmgd->assoc_data->timeout = jiffies; ifmgd->assoc_data->timeout_started = true; run_again(sdata, ifmgd->assoc_data->timeout); kfree(elems); return; } if (!ifmgd->associated || !ieee80211_rx_our_beacon(bssid, bss_conf->bss)) return; bssid = link->u.mgd.bssid; if (!(rx_status->flag & RX_FLAG_NO_SIGNAL_VAL)) ieee80211_handle_beacon_sig(link, ifmgd, bss_conf, local, rx_status); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) { mlme_dbg_ratelimited(sdata, "cancelling AP probe due to a received beacon\n"); ieee80211_reset_ap_probe(sdata); } /* * Push the beacon loss detection into the future since * we are processing a beacon from the AP just now. */ ieee80211_sta_reset_beacon_monitor(sdata); /* TODO: CRC urrently not calculated on S1G Beacon Compatibility * element (which carries the beacon interval). Don't forget to add a * bit to care_about_ies[] above if mac80211 is interested in a * changing S1G element. */ if (!ieee80211_is_s1g_beacon(hdr->frame_control)) ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); parse_params.bss = bss_conf->bss; parse_params.filter = care_about_ies; parse_params.crc = ncrc; elems = ieee802_11_parse_elems_full(&parse_params); if (!elems) return; if (rx_status->flag & RX_FLAG_DECRYPTED && ieee80211_mgd_ssid_mismatch(sdata, elems)) { sdata_info(sdata, "SSID mismatch for AP %pM, disconnect\n", sdata->vif.cfg.ap_addr); __ieee80211_disconnect(sdata); return; } ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } ieee80211_send_nullfunc(local, sdata, false); } else if (!local->pspolling && sdata->u.mgd.powersave) { local->pspolling = true; /* * Here is assumed that the driver will be * able to send ps-poll frame and receive a * response even though power save mode is * enabled, but some drivers might require * to disable power save here. This needs * to be investigated. */ ieee80211_send_pspoll(local, sdata); } } if (sdata->vif.p2p || sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) { struct ieee80211_p2p_noa_attr noa = {}; int ret; ret = cfg80211_get_p2p_attr(variable, len - baselen, IEEE80211_P2P_ATTR_ABSENCE_NOTICE, (u8 *) &noa, sizeof(noa)); if (ret >= 2) { if (link->u.mgd.p2p_noa_index != noa.index) { /* valid noa_attr and index changed */ link->u.mgd.p2p_noa_index = noa.index; memcpy(&bss_conf->p2p_noa_attr, &noa, sizeof(noa)); changed |= BSS_CHANGED_P2P_PS; /* * make sure we update all information, the CRC * mechanism doesn't look at P2P attributes. */ link->u.mgd.beacon_crc_valid = false; } } else if (link->u.mgd.p2p_noa_index != -1) { /* noa_attr not found and we had valid noa_attr before */ link->u.mgd.p2p_noa_index = -1; memset(&bss_conf->p2p_noa_attr, 0, sizeof(bss_conf->p2p_noa_attr)); changed |= BSS_CHANGED_P2P_PS; link->u.mgd.beacon_crc_valid = false; } } /* * Update beacon timing and dtim count on every beacon appearance. This * will allow the driver to use the most updated values. Do it before * comparing this one with last received beacon. * IMPORTANT: These parameters would possibly be out of sync by the time * the driver will use them. The synchronized view is currently * guaranteed only in certain callbacks. */ if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY) && !ieee80211_is_s1g_beacon(hdr->frame_control)) { bss_conf->sync_tsf = le64_to_cpu(mgmt->u.beacon.timestamp); bss_conf->sync_device_ts = rx_status->device_timestamp; bss_conf->sync_dtim_count = elems->dtim_count; } if ((ncrc == link->u.mgd.beacon_crc && link->u.mgd.beacon_crc_valid) || ieee80211_is_s1g_short_beacon(mgmt->frame_control)) goto free; link->u.mgd.beacon_crc = ncrc; link->u.mgd.beacon_crc_valid = true; ieee80211_rx_bss_info(link, mgmt, len, rx_status); ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, IEEE80211_CSA_SOURCE_BEACON); /* note that after this elems->ml_basic can no longer be used fully */ ieee80211_mgd_check_cross_link_csa(sdata, rx_status->link_id, elems); ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); if (!sdata->u.mgd.epcs.enabled && !link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* * If we haven't had a beacon before, tell the driver about the * DTIM period (and beacon timing if desired) now. */ if (!link->u.mgd.have_beacon) { /* a few bogus AP send dtim_period = 0 or no TIM IE */ bss_conf->dtim_period = elems->dtim_period ?: 1; changed |= BSS_CHANGED_BEACON_INFO; link->u.mgd.have_beacon = true; ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); } if (elems->erp_info) { erp_valid = true; erp_value = elems->erp_info[0]; } else { erp_valid = false; } if (!ieee80211_is_s1g_beacon(hdr->frame_control)) changed |= ieee80211_handle_bss_capability(link, le16_to_cpu(mgmt->u.beacon.capab_info), erp_valid, erp_value); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) { goto free; } link_sta = rcu_dereference_protected(sta->link[link->link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) { goto free; } if (WARN_ON(!bss_conf->chanreq.oper.chan)) goto free; sband = local->hw.wiphy->bands[bss_conf->chanreq.oper.chan->band]; changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (ieee80211_config_bw(link, elems, true, &changed, "beacon")) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); ieee80211_report_disconnect(sdata, deauth_buf, sizeof(deauth_buf), true, WLAN_REASON_DEAUTH_LEAVING, false); goto free; } if (elems->opmode_notif) ieee80211_vht_handle_opmode(sdata, link_sta, *elems->opmode_notif, rx_status->band); changed |= ieee80211_handle_pwr_constr(link, chan, mgmt, elems->country_elem, elems->country_elem_len, elems->pwr_constr_elem, elems->cisco_dtpc_elem); ieee80211_ml_reconfiguration(sdata, elems); ieee80211_process_adv_ttlm(sdata, elems, le64_to_cpu(mgmt->u.beacon.timestamp)); ieee80211_link_info_change_notify(sdata, link, changed); free: kfree(elems); } static void ieee80211_apply_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm neg_ttlm) { u16 new_active_links, new_dormant_links, new_suspended_links, map = 0; u8 i; for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) map |= neg_ttlm.downlink[i] | neg_ttlm.uplink[i]; /* If there is an active TTLM, unset previously suspended links */ if (sdata->vif.neg_ttlm.valid) sdata->vif.dormant_links &= ~sdata->vif.suspended_links; /* exclude links that are already disabled by advertised TTLM */ new_active_links = map & sdata->vif.valid_links & ~sdata->vif.dormant_links; new_suspended_links = (~map & sdata->vif.valid_links) & ~sdata->vif.dormant_links; new_dormant_links = sdata->vif.dormant_links | new_suspended_links; if (ieee80211_ttlm_set_links(sdata, new_active_links, new_dormant_links, new_suspended_links)) return; sdata->vif.neg_ttlm = neg_ttlm; sdata->vif.neg_ttlm.valid = true; } static void ieee80211_neg_ttlm_timeout_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.neg_ttlm_timeout_work.work); sdata_info(sdata, "No negotiated TTLM response from AP, disconnecting.\n"); __ieee80211_disconnect(sdata); } static void ieee80211_neg_ttlm_add_suggested_map(struct sk_buff *skb, struct ieee80211_neg_ttlm *neg_ttlm) { u8 i, direction[IEEE80211_TTLM_MAX_CNT]; if (memcmp(neg_ttlm->downlink, neg_ttlm->uplink, sizeof(neg_ttlm->downlink))) { direction[0] = IEEE80211_TTLM_DIRECTION_DOWN; direction[1] = IEEE80211_TTLM_DIRECTION_UP; } else { direction[0] = IEEE80211_TTLM_DIRECTION_BOTH; } for (i = 0; i < ARRAY_SIZE(direction); i++) { u8 tid, len, map_ind = 0, *len_pos, *map_ind_pos, *pos; __le16 map; len = sizeof(struct ieee80211_ttlm_elem) + 1 + 1; pos = skb_put(skb, len + 2); *pos++ = WLAN_EID_EXTENSION; len_pos = pos++; *pos++ = WLAN_EID_EXT_TID_TO_LINK_MAPPING; *pos++ = direction[i]; map_ind_pos = pos++; for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { map = direction[i] == IEEE80211_TTLM_DIRECTION_UP ? cpu_to_le16(neg_ttlm->uplink[tid]) : cpu_to_le16(neg_ttlm->downlink[tid]); if (!map) continue; len += 2; map_ind |= BIT(tid); skb_put_data(skb, &map, sizeof(map)); } *map_ind_pos = map_ind; *len_pos = len; if (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH) break; } } static void ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_neg_ttlm *neg_ttlm, u8 dialog_token) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_req.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; mgmt->u.action.u.ttlm_req.dialog_token = dialog_token; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); ieee80211_tx_skb(sdata, skb); } int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct cfg80211_ttlm_params *params) { struct ieee80211_neg_ttlm neg_ttlm = {}; u8 i; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->vif.cfg.mld_capa_op & IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP)) return -EINVAL; for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if ((params->dlink[i] & ~sdata->vif.valid_links) || (params->ulink[i] & ~sdata->vif.valid_links)) return -EINVAL; neg_ttlm.downlink[i] = params->dlink[i]; neg_ttlm.uplink[i] = params->ulink[i]; } if (drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm) != NEG_TTLM_RES_ACCEPT) return -EINVAL; ieee80211_apply_neg_ttlm(sdata, neg_ttlm); sdata->u.mgd.dialog_token_alloc++; ieee80211_send_neg_ttlm_req(sdata, &sdata->vif.neg_ttlm, sdata->u.mgd.dialog_token_alloc); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work); wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work, IEEE80211_NEG_TTLM_REQ_TIMEOUT); return 0; } static void ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, enum ieee80211_neg_ttlm_res ttlm_res, u8 dialog_token, struct ieee80211_neg_ttlm *neg_ttlm) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; u16 status_code; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) return; skb_reserve(skb, local->tx_headroom); mgmt = skb_put_zero(skb, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_res.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_RES; mgmt->u.action.u.ttlm_res.dialog_token = dialog_token; switch (ttlm_res) { default: WARN_ON(1); fallthrough; case NEG_TTLM_RES_REJECT: status_code = WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; break; case NEG_TTLM_RES_ACCEPT: status_code = WLAN_STATUS_SUCCESS; break; case NEG_TTLM_RES_SUGGEST_PREFERRED: status_code = WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); break; } mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } static int ieee80211_parse_neg_ttlm(struct ieee80211_sub_if_data *sdata, const struct ieee80211_ttlm_elem *ttlm, struct ieee80211_neg_ttlm *neg_ttlm, u8 *direction) { u8 control, link_map_presence, map_size, tid; u8 *pos; /* The element size was already validated in * ieee80211_tid_to_link_map_size_ok() */ pos = (void *)ttlm->optional; control = ttlm->control; /* mapping switch time and expected duration fields are not expected * in case of negotiated TTLM */ if (control & (IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT | IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT)) { mlme_dbg(sdata, "Invalid TTLM element in negotiated TTLM request\n"); return -EINVAL; } if (control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP) { for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { neg_ttlm->downlink[tid] = sdata->vif.valid_links; neg_ttlm->uplink[tid] = sdata->vif.valid_links; } *direction = IEEE80211_TTLM_DIRECTION_BOTH; return 0; } *direction = u8_get_bits(control, IEEE80211_TTLM_CONTROL_DIRECTION); if (*direction != IEEE80211_TTLM_DIRECTION_DOWN && *direction != IEEE80211_TTLM_DIRECTION_UP && *direction != IEEE80211_TTLM_DIRECTION_BOTH) return -EINVAL; link_map_presence = *pos; pos++; if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) map_size = 1; else map_size = 2; for (tid = 0; tid < IEEE80211_TTLM_NUM_TIDS; tid++) { u16 map; if (link_map_presence & BIT(tid)) { map = ieee80211_get_ttlm(map_size, pos); if (!map) { mlme_dbg(sdata, "No active links for TID %d", tid); return -EINVAL; } } else { map = 0; } switch (*direction) { case IEEE80211_TTLM_DIRECTION_BOTH: neg_ttlm->downlink[tid] = map; neg_ttlm->uplink[tid] = map; break; case IEEE80211_TTLM_DIRECTION_DOWN: neg_ttlm->downlink[tid] = map; break; case IEEE80211_TTLM_DIRECTION_UP: neg_ttlm->uplink[tid] = map; break; default: return -EINVAL; } pos += map_size; } return 0; } void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { u8 dialog_token, direction[IEEE80211_TTLM_MAX_CNT] = {}, i; size_t ies_len; enum ieee80211_neg_ttlm_res ttlm_res = NEG_TTLM_RES_ACCEPT; struct ieee802_11_elems *elems = NULL; struct ieee80211_neg_ttlm neg_ttlm = {}; BUILD_BUG_ON(ARRAY_SIZE(direction) != ARRAY_SIZE(elems->ttlm)); if (!ieee80211_vif_is_mld(&sdata->vif)) return; dialog_token = mgmt->u.action.u.ttlm_req.dialog_token; ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.ttlm_req.variable); elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, ies_len, true, NULL); if (!elems) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } for (i = 0; i < elems->ttlm_num; i++) { if (ieee80211_parse_neg_ttlm(sdata, elems->ttlm[i], &neg_ttlm, &direction[i]) || (direction[i] == IEEE80211_TTLM_DIRECTION_BOTH && elems->ttlm_num != 1)) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } } if (!elems->ttlm_num || (elems->ttlm_num == 2 && direction[0] == direction[1])) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } for (i = 0; i < IEEE80211_TTLM_NUM_TIDS; i++) { if ((neg_ttlm.downlink[i] && (neg_ttlm.downlink[i] & ~sdata->vif.valid_links)) || (neg_ttlm.uplink[i] && (neg_ttlm.uplink[i] & ~sdata->vif.valid_links))) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; } } ttlm_res = drv_can_neg_ttlm(sdata->local, sdata, &neg_ttlm); if (ttlm_res != NEG_TTLM_RES_ACCEPT) goto out; ieee80211_apply_neg_ttlm(sdata, neg_ttlm); out: kfree(elems); ieee80211_send_neg_ttlm_res(sdata, ttlm_res, dialog_token, &neg_ttlm); } void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || mgmt->u.action.u.ttlm_req.dialog_token != sdata->u.mgd.dialog_token_alloc) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.neg_ttlm_timeout_work); /* MLD station sends a TID to link mapping request, mainly to handle * BTM (BSS transition management) request, in which case it needs to * restrict the active links set. * In this case it's not expected that the MLD AP will reject the * negotiated TTLM request. * This can be better implemented in the future, to handle request * rejections. */ if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } void ieee80211_process_ttlm_teardown(struct ieee80211_sub_if_data *sdata) { u16 new_dormant_links; if (!sdata->vif.neg_ttlm.valid) return; memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); new_dormant_links = sdata->vif.dormant_links & ~sdata->vif.suspended_links; sdata->vif.suspended_links = 0; ieee80211_vif_set_links(sdata, sdata->vif.valid_links, new_dormant_links); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_TTLM | BSS_CHANGED_MLD_VALID_LINKS); } static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.teardown_ttlm_work); ieee80211_process_ttlm_teardown(sdata); } void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int frame_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_tear_down); struct ieee80211_tx_info *info; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, frame_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ttlm_tear_down.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; info->status_data = IEEE80211_STATUS_TYPE_NEG_TTLM; ieee80211_tx_skb(sdata, skb); } EXPORT_SYMBOL(ieee80211_send_teardown_neg_ttlm); void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_rx_status *rx_status; struct ieee80211_hdr *hdr; u16 fc; lockdep_assert_wiphy(sdata->local->hw.wiphy); rx_status = (struct ieee80211_rx_status *) skb->cb; hdr = (struct ieee80211_hdr *) skb->data; fc = le16_to_cpu(hdr->frame_control); switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_S1G_BEACON: ieee80211_rx_mgmt_beacon(link, hdr, skb->len, rx_status); break; } } void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; struct ieee80211_mgmt *mgmt; u16 fc; int ies_len; lockdep_assert_wiphy(sdata->local->hw.wiphy); rx_status = (struct ieee80211_rx_status *) skb->cb; mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); if (rx_status->link_valid) { link = sdata_dereference(sdata->link[rx_status->link_id], sdata); if (!link) return; } switch (fc & IEEE80211_FCTL_STYPE) { case IEEE80211_STYPE_BEACON: ieee80211_rx_mgmt_beacon(link, (void *)mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_PROBE_RESP: ieee80211_rx_mgmt_probe_resp(link, skb); break; case IEEE80211_STYPE_AUTH: ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DEAUTH: ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_DISASSOC: ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ASSOC_RESP: case IEEE80211_STYPE_REASSOC_RESP: ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len); break; case IEEE80211_STYPE_ACTION: if (!sdata->u.mgd.associated || !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) break; switch (mgmt->u.action.category) { case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); if (ies_len < 0) break; /* CSA IE cannot be overridden, no need for BSSID */ elems = ieee802_11_parse_elems( mgmt->u.action.u.chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src = IEEE80211_CSA_SOURCE_PROT_ACTION; ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, src); } kfree(elems); break; case WLAN_CATEGORY_PUBLIC: case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.ext_chan_switch.variable); if (ies_len < 0) break; /* * extended CSA IE can't be overridden, no need for * BSSID */ elems = ieee802_11_parse_elems( mgmt->u.action.u.ext_chan_switch.variable, ies_len, true, NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src; if (mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) src = IEEE80211_CSA_SOURCE_PROT_ACTION; else src = IEEE80211_CSA_SOURCE_UNPROT_ACTION; /* for the handling code pretend it was an IE */ elems->ext_chansw_ie = &mgmt->u.action.u.ext_chan_switch.data; ieee80211_sta_process_chanswitch(link, rx_status->mactime, rx_status->device_timestamp, elems, elems, src); } kfree(elems); break; } break; } } static void ieee80211_sta_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.mgd.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, u8 reason, bool tx) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, reason, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, reason, false); } static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data = ifmgd->auth_data; u32 tx_flags = 0; u16 trans = 1; u16 status = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON_ONCE(!auth_data)) return -EINVAL; auth_data->tries++; if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) { sdata_info(sdata, "authentication with %pM timed out\n", auth_data->ap_addr); /* * Most likely AP is not in the range so remove the * bss struct for that AP. */ cfg80211_unlink_bss(local->hw.wiphy, auth_data->bss); return -ETIMEDOUT; } if (auth_data->algorithm == WLAN_AUTH_SAE) info.duration = jiffies_to_msecs(IEEE80211_AUTH_TIMEOUT_SAE); info.link_id = auth_data->link_id; drv_mgd_prepare_tx(local, sdata, &info); sdata_info(sdata, "send auth to %pM (try %d/%d)\n", auth_data->ap_addr, auth_data->tries, IEEE80211_AUTH_MAX_TRIES); auth_data->expected_transaction = 2; if (auth_data->algorithm == WLAN_AUTH_SAE) { trans = auth_data->sae_trans; status = auth_data->sae_status; auth_data->expected_transaction = trans; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, trans, auth_data->algorithm, status, auth_data->data, auth_data->data_len, auth_data->ap_addr, auth_data->ap_addr, NULL, 0, 0, tx_flags); if (tx_flags == 0) { if (auth_data->algorithm == WLAN_AUTH_SAE) auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SAE; else auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT; } else { auth_data->timeout = round_jiffies_up(jiffies + IEEE80211_AUTH_TIMEOUT_LONG); } auth_data->timeout_started = true; run_again(sdata, auth_data->timeout); return 0; } static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; struct ieee80211_local *local = sdata->local; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); assoc_data->tries++; assoc_data->comeback = false; if (assoc_data->tries > IEEE80211_ASSOC_MAX_TRIES) { sdata_info(sdata, "association with %pM timed out\n", assoc_data->ap_addr); /* * Most likely AP is not in the range so remove the * bss struct for that AP. */ cfg80211_unlink_bss(local->hw.wiphy, assoc_data->link[assoc_data->assoc_link_id].bss); return -ETIMEDOUT; } sdata_info(sdata, "associate with %pM (try %d/%d)\n", assoc_data->ap_addr, assoc_data->tries, IEEE80211_ASSOC_MAX_TRIES); ret = ieee80211_send_assoc(sdata); if (ret) return ret; if (!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT; assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } else { assoc_data->timeout = round_jiffies_up(jiffies + IEEE80211_ASSOC_TIMEOUT_LONG); assoc_data->timeout_started = true; run_again(sdata, assoc_data->timeout); } return 0; } void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked) { struct ieee80211_local *local = sdata->local; sdata->u.mgd.status_fc = fc; sdata->u.mgd.status_acked = acked; sdata->u.mgd.status_received = true; wiphy_work_queue(local->hw.wiphy, &sdata->work); } void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifmgd->status_received) { __le16 fc = ifmgd->status_fc; bool status_acked = ifmgd->status_acked; ifmgd->status_received = false; if (ifmgd->auth_data && ieee80211_is_auth(fc)) { if (status_acked) { if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE) ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SAE; else ifmgd->auth_data->timeout = jiffies + IEEE80211_AUTH_TIMEOUT_SHORT; run_again(sdata, ifmgd->auth_data->timeout); } else { ifmgd->auth_data->timeout = jiffies - 1; } ifmgd->auth_data->timeout_started = true; } else if (ifmgd->assoc_data && !ifmgd->assoc_data->comeback && (ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc))) { /* * Update association timeout based on the TX status * for the (Re)Association Request frame. Skip this if * we have already processed a (Re)Association Response * frame that indicated need for association comeback * at a specific time in the future. This could happen * if the TX status information is delayed enough for * the response to be received and processed first. */ if (status_acked) { ifmgd->assoc_data->timeout = jiffies + IEEE80211_ASSOC_TIMEOUT_SHORT; run_again(sdata, ifmgd->assoc_data->timeout); } else { ifmgd->assoc_data->timeout = jiffies - 1; } ifmgd->assoc_data->timeout_started = true; } } if (ifmgd->auth_data && ifmgd->auth_data->timeout_started && time_after(jiffies, ifmgd->auth_data->timeout)) { if (ifmgd->auth_data->done || ifmgd->auth_data->waiting) { /* * ok ... we waited for assoc or continuation but * userspace didn't do it, so kill the auth data */ ieee80211_destroy_auth_data(sdata, false); } else if (ieee80211_auth(sdata)) { u8 ap_addr[ETH_ALEN]; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, .u.mlme.status = MLME_TIMEOUT, }; memcpy(ap_addr, ifmgd->auth_data->ap_addr, ETH_ALEN); ieee80211_destroy_auth_data(sdata, false); cfg80211_auth_timeout(sdata->dev, ap_addr); drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->auth_data && ifmgd->auth_data->timeout_started) run_again(sdata, ifmgd->auth_data->timeout); if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started && time_after(jiffies, ifmgd->assoc_data->timeout)) { if ((ifmgd->assoc_data->need_beacon && !sdata->deflink.u.mgd.have_beacon) || ieee80211_do_assoc(sdata)) { struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = ASSOC_EVENT, .u.mlme.status = MLME_TIMEOUT, }; ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); drv_event_callback(sdata->local, sdata, &event); } } else if (ifmgd->assoc_data && ifmgd->assoc_data->timeout_started) run_again(sdata, ifmgd->assoc_data->timeout); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL && ifmgd->associated) { u8 *bssid = sdata->deflink.u.mgd.bssid; int max_tries; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) max_tries = max_nullfunc_tries; else max_tries = max_probe_tries; /* ACK received for nullfunc probing frame */ if (!ifmgd->probe_send_count) ieee80211_reset_ap_probe(sdata); else if (ifmgd->nullfunc_failed) { if (ifmgd->probe_send_count < max_tries) { mlme_dbg(sdata, "No ack for nullfunc frame to AP %pM, try %d/%i\n", bssid, ifmgd->probe_send_count, max_tries); ieee80211_mgd_probe_ap_send(sdata); } else { mlme_dbg(sdata, "No ack for nullfunc frame to AP %pM, disconnecting.\n", bssid); ieee80211_sta_connection_lost(sdata, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } } else if (time_is_after_jiffies(ifmgd->probe_timeout)) run_again(sdata, ifmgd->probe_timeout); else if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { mlme_dbg(sdata, "Failed to send nullfunc to AP %pM after %dms, disconnecting\n", bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } else if (ifmgd->probe_send_count < max_tries) { mlme_dbg(sdata, "No probe response from AP %pM after %dms, try %d/%i\n", bssid, probe_wait_ms, ifmgd->probe_send_count, max_tries); ieee80211_mgd_probe_ap_send(sdata); } else { /* * We actually lost the connection ... or did we? * Let's make sure! */ mlme_dbg(sdata, "No probe response from AP %pM after %dms, disconnecting.\n", bssid, probe_wait_ms); ieee80211_sta_connection_lost(sdata, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false); } } } static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.mgd.bcn_mon_timer); if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif))) return; if (sdata->vif.bss_conf.csa_active && !sdata->deflink.u.mgd.csa.waiting_bcn) return; if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) return; sdata->u.mgd.connection_loss = false; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.beacon_connection_loss_work); } static void ieee80211_sta_conn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = from_timer(sdata, t, u.mgd.conn_mon_timer); struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; struct sta_info *sta; unsigned long timeout; if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif))) return; if (sdata->vif.bss_conf.csa_active && !sdata->deflink.u.mgd.csa.waiting_bcn) return; sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (!sta) return; timeout = sta->deflink.status_stats.last_ack; if (time_before(sta->deflink.status_stats.last_ack, sta->deflink.rx_stats.last_rx)) timeout = sta->deflink.rx_stats.last_rx; timeout += IEEE80211_CONNECTION_IDLE_TIME; /* If timeout is after now, then update timer to fire at * the later date, but do not actually probe at this time. */ if (time_is_after_jiffies(timeout)) { mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(timeout)); return; } wiphy_work_queue(local->hw.wiphy, &sdata->u.mgd.monitor_work); } static void ieee80211_sta_monitor_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.monitor_work); ieee80211_mgd_probe_ap(sdata, false); } static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) { if (sdata->vif.type == NL80211_IFTYPE_STATION) { __ieee80211_stop_poll(sdata); /* let's probe the connection once */ if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.monitor_work); } } #ifdef CONFIG_PM void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (ifmgd->auth_data || ifmgd->assoc_data) { const u8 *ap_addr = ifmgd->auth_data ? ifmgd->auth_data->ap_addr : ifmgd->assoc_data->ap_addr; /* * If we are trying to authenticate / associate while suspending, * cfg80211 won't know and won't actually abort those attempts, * thus we need to do that ourselves. */ ieee80211_send_deauth_disassoc(sdata, ap_addr, ap_addr, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, false, frame_buf); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, false); cfg80211_tx_mlme_mgmt(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN, false); } /* This is a bit of a hack - we should find a better and more generic * solution to this. Normally when suspending, cfg80211 will in fact * deauthenticate. However, it doesn't (and cannot) stop an ongoing * auth (not so important) or assoc (this is the problem) process. * * As a consequence, it can happen that we are in the process of both * associating and suspending, and receive an association response * after cfg80211 has checked if it needs to disconnect, but before * we actually set the flag to drop incoming frames. This will then * cause the workqueue flush to process the association response in * the suspend, resulting in a successful association just before it * tries to remove the interface from the driver, which now though * has a channel context assigned ... this results in issues. * * To work around this (for now) simply deauth here again if we're * now connected. */ if (ifmgd->associated && !sdata->local->wowlan) { u8 bssid[ETH_ALEN]; struct cfg80211_deauth_request req = { .reason_code = WLAN_REASON_DEAUTH_LEAVING, .bssid = bssid, }; memcpy(bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); ieee80211_mgd_deauth(sdata, &req); } } #endif void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ifmgd->associated) return; if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) { sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME; mlme_dbg(sdata, "driver requested disconnect after resume\n"); ieee80211_sta_connection_lost(sdata, WLAN_REASON_UNSPECIFIED, true); return; } if (sdata->flags & IEEE80211_SDATA_DISCONNECT_HW_RESTART) { sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_HW_RESTART; mlme_dbg(sdata, "driver requested disconnect after hardware restart\n"); ieee80211_sta_connection_lost(sdata, WLAN_REASON_UNSPECIFIED, true); return; } } static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.request_smps_work); __ieee80211_request_smps_mgd(link->sdata, link, link->u.mgd.driver_smps_mode); } static void ieee80211_ml_sta_reconf_timeout(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_sub_if_data *sdata = container_of(work, struct ieee80211_sub_if_data, u.mgd.reconf.wk.work); if (!sdata->u.mgd.reconf.added_links && !sdata->u.mgd.reconf.removed_links) return; sdata_info(sdata, "mlo: reconf: timeout: added=0x%x, removed=0x%x\n", sdata->u.mgd.reconf.added_links, sdata->u.mgd.reconf.removed_links); __ieee80211_disconnect(sdata); } /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; wiphy_work_init(&ifmgd->monitor_work, ieee80211_sta_monitor_work); wiphy_work_init(&ifmgd->beacon_connection_loss_work, ieee80211_beacon_connection_loss_work); wiphy_work_init(&ifmgd->csa_connection_drop_work, ieee80211_csa_connection_drop_work); wiphy_delayed_work_init(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); wiphy_delayed_work_init(&ifmgd->ml_reconf_work, ieee80211_ml_reconf_work); wiphy_delayed_work_init(&ifmgd->reconf.wk, ieee80211_ml_sta_reconf_timeout); timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0); timer_setup(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 0); timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); wiphy_delayed_work_init(&ifmgd->tx_tspec_wk, ieee80211_sta_handle_tspec_ac_params_wk); wiphy_delayed_work_init(&ifmgd->ttlm_work, ieee80211_tid_to_link_map_work); wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, ieee80211_neg_ttlm_timeout_work); wiphy_work_init(&ifmgd->teardown_ttlm_work, ieee80211_teardown_ttlm_work); ifmgd->flags = 0; ifmgd->powersave = sdata->wdev.ps; ifmgd->uapsd_queues = sdata->local->hw.uapsd_queues; ifmgd->uapsd_max_sp_len = sdata->local->hw.uapsd_max_sp_len; /* Setup TDLS data */ spin_lock_init(&ifmgd->teardown_lock); ifmgd->teardown_skb = NULL; ifmgd->orig_teardown_skb = NULL; ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; } static void ieee80211_recalc_smps_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_link_data *link = container_of(work, struct ieee80211_link_data, u.mgd.recalc_smps); ieee80211_recalc_smps(link->sdata, link); } void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; link->u.mgd.p2p_noa_index = -1; link->conf->bssid = link->u.mgd.bssid; link->smps_mode = IEEE80211_SMPS_OFF; wiphy_work_init(&link->u.mgd.request_smps_work, ieee80211_request_smps_mgd_work); wiphy_work_init(&link->u.mgd.recalc_smps, ieee80211_recalc_smps_work); if (local->hw.wiphy->features & NL80211_FEATURE_DYNAMIC_SMPS) link->u.mgd.req_smps = IEEE80211_SMPS_AUTOMATIC; else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; wiphy_delayed_work_init(&link->u.mgd.csa.switch_work, ieee80211_csa_switch_work); ieee80211_clear_tpe(&link->conf->tpe); if (sdata->u.mgd.assoc_data) ether_addr_copy(link->conf->addr, sdata->u.mgd.assoc_data->link[link_id].addr); else if (sdata->u.mgd.reconf.add_links_data) ether_addr_copy(link->conf->addr, sdata->u.mgd.reconf.add_links_data->link[link_id].addr); else if (!is_valid_ether_addr(link->conf->addr)) eth_random_addr(link->conf->addr); } /* scan finished notification */ void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; /* Restart STA timers */ rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) ieee80211_restart_sta_timer(sdata); } rcu_read_unlock(); } static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, s8 link_id, const u8 *ap_mld_addr, bool assoc, struct ieee80211_conn_settings *conn, bool override, unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_link_data *link; bool have_sta = false; bool mlo; int err; if (link_id >= 0) { mlo = true; if (WARN_ON(!ap_mld_addr)) return -EINVAL; err = ieee80211_vif_set_links(sdata, BIT(link_id), 0); } else { if (WARN_ON(ap_mld_addr)) return -EINVAL; ap_mld_addr = cbss->bssid; err = ieee80211_vif_set_links(sdata, 0, 0); link_id = 0; mlo = false; } if (err) return err; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) { err = -ENOLINK; goto out_err; } if (WARN_ON(!ifmgd->auth_data && !ifmgd->assoc_data)) { err = -EINVAL; goto out_err; } /* If a reconfig is happening, bail out */ if (local->in_reconfig) { err = -EBUSY; goto out_err; } if (assoc) { rcu_read_lock(); have_sta = sta_info_get(sdata, ap_mld_addr); rcu_read_unlock(); } if (!have_sta) { if (mlo) new_sta = sta_info_alloc_with_link(sdata, ap_mld_addr, link_id, cbss->bssid, GFP_KERNEL); else new_sta = sta_info_alloc(sdata, ap_mld_addr, GFP_KERNEL); if (!new_sta) { err = -ENOMEM; goto out_err; } new_sta->sta.mlo = mlo; } /* * Set up the information for the new channel before setting the * new channel. We can't - completely race-free - change the basic * rates bitmap and the channel (sband) that it refers to, but if * we set it up before we at least avoid calling into the driver's * bss_info_changed() method with invalid information (since we do * call that from changing the channel - only for IDLE and perhaps * some others, but ...). * * So to avoid that, just set up all the new information before the * channel, but tell the driver to apply it only afterwards, since * it might need the new channel for that. */ if (new_sta) { const struct cfg80211_bss_ies *ies; struct link_sta_info *link_sta; rcu_read_lock(); link_sta = rcu_dereference(new_sta->link[link_id]); if (WARN_ON(!link_sta)) { rcu_read_unlock(); sta_info_free(local, new_sta); err = -EINVAL; goto out_err; } err = ieee80211_mgd_setup_link_sta(link, new_sta, link_sta, cbss); if (err) { rcu_read_unlock(); sta_info_free(local, new_sta); goto out_err; } memcpy(link->u.mgd.bssid, cbss->bssid, ETH_ALEN); /* set timing information */ link->conf->beacon_int = cbss->beacon_interval; ies = rcu_dereference(cbss->beacon_ies); if (ies) { link->conf->sync_tsf = ies->tsf; link->conf->sync_device_ts = bss->device_ts_beacon; ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, NULL); } else if (!ieee80211_hw_check(&sdata->local->hw, TIMING_BEACON_ONLY)) { ies = rcu_dereference(cbss->proberesp_ies); /* must be non-NULL since beacon IEs were NULL */ link->conf->sync_tsf = ies->tsf; link->conf->sync_device_ts = bss->device_ts_presp; link->conf->sync_dtim_count = 0; } else { link->conf->sync_tsf = 0; link->conf->sync_device_ts = 0; link->conf->sync_dtim_count = 0; } rcu_read_unlock(); } if (new_sta || override) { /* * Only set this if we're also going to calculate the AP * settings etc., otherwise this was set before in a * previous call. Note override is set to %true in assoc * if the settings were changed. */ link->u.mgd.conn = *conn; err = ieee80211_prep_channel(sdata, link, link->link_id, cbss, mlo, &link->u.mgd.conn, userspace_selectors); if (err) { if (new_sta) sta_info_free(local, new_sta); goto out_err; } /* pass out for use in assoc */ *conn = link->u.mgd.conn; } if (new_sta) { /* * tell driver about BSSID, basic rates and timing * this was set up above, before setting the channel */ ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BSSID | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BEACON_INT); if (assoc) sta_info_pre_move_state(new_sta, IEEE80211_STA_AUTH); err = sta_info_insert(new_sta); new_sta = NULL; if (err) { sdata_info(sdata, "failed to insert STA entry for the AP (error %d)\n", err); goto out_release_chan; } } else WARN_ON_ONCE(!ether_addr_equal(link->u.mgd.bssid, cbss->bssid)); /* Cancel scan to ensure that nothing interferes with connection */ if (local->scanning) ieee80211_scan_cancel(local); return 0; out_release_chan: ieee80211_link_release_channel(link); out_err: ieee80211_vif_set_links(sdata, 0, 0); return err; } static bool ieee80211_mgd_csa_present(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies, u8 cur_channel, bool ignore_ecsa) { const struct element *csa_elem, *ecsa_elem; struct ieee80211_channel_sw_ie *csa = NULL; struct ieee80211_ext_chansw_ie *ecsa = NULL; if (!ies) return false; csa_elem = cfg80211_find_elem(WLAN_EID_CHANNEL_SWITCH, ies->data, ies->len); if (csa_elem && csa_elem->datalen == sizeof(*csa)) csa = (void *)csa_elem->data; ecsa_elem = cfg80211_find_elem(WLAN_EID_EXT_CHANSWITCH_ANN, ies->data, ies->len); if (ecsa_elem && ecsa_elem->datalen == sizeof(*ecsa)) ecsa = (void *)ecsa_elem->data; if (csa && csa->count == 0) csa = NULL; if (csa && !csa->mode && csa->new_ch_num == cur_channel) csa = NULL; if (ecsa && ecsa->count == 0) ecsa = NULL; if (ecsa && !ecsa->mode && ecsa->new_ch_num == cur_channel) ecsa = NULL; if (ignore_ecsa && ecsa) { sdata_info(sdata, "Ignoring ECSA in probe response - was considered stuck!\n"); return csa; } return csa || ecsa; } static bool ieee80211_mgd_csa_in_process(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *bss) { u8 cur_channel; bool ret; cur_channel = ieee80211_frequency_to_channel(bss->channel->center_freq); rcu_read_lock(); if (ieee80211_mgd_csa_present(sdata, rcu_dereference(bss->beacon_ies), cur_channel, false)) { ret = true; goto out; } if (ieee80211_mgd_csa_present(sdata, rcu_dereference(bss->proberesp_ies), cur_channel, bss->proberesp_ecsa_stuck)) { ret = true; goto out; } ret = false; out: rcu_read_unlock(); return ret; } static void ieee80211_parse_cfg_selectors(unsigned long *userspace_selectors, const u8 *supported_selectors, u8 supported_selectors_len) { if (supported_selectors) { for (int i = 0; i < supported_selectors_len; i++) { set_bit(supported_selectors[i], userspace_selectors); } } else { /* Assume SAE_H2E support for backward compatibility. */ set_bit(BSS_MEMBERSHIP_SELECTOR_SAE_H2E, userspace_selectors); } } /* config hooks */ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_conn_settings conn; struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; struct ieee80211_bss *bss; u16 auth_alg; int err; bool cont_auth, wmm_used; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* prepare auth data structure */ switch (req->auth_type) { case NL80211_AUTHTYPE_OPEN_SYSTEM: auth_alg = WLAN_AUTH_OPEN; break; case NL80211_AUTHTYPE_SHARED_KEY: if (fips_enabled) return -EOPNOTSUPP; auth_alg = WLAN_AUTH_SHARED_KEY; break; case NL80211_AUTHTYPE_FT: auth_alg = WLAN_AUTH_FT; break; case NL80211_AUTHTYPE_NETWORK_EAP: auth_alg = WLAN_AUTH_LEAP; break; case NL80211_AUTHTYPE_SAE: auth_alg = WLAN_AUTH_SAE; break; case NL80211_AUTHTYPE_FILS_SK: auth_alg = WLAN_AUTH_FILS_SK; break; case NL80211_AUTHTYPE_FILS_SK_PFS: auth_alg = WLAN_AUTH_FILS_SK_PFS; break; case NL80211_AUTHTYPE_FILS_PK: auth_alg = WLAN_AUTH_FILS_PK; break; default: return -EOPNOTSUPP; } if (ifmgd->assoc_data) return -EBUSY; if (ieee80211_mgd_csa_in_process(sdata, req->bss)) { sdata_info(sdata, "AP is in CSA process, reject auth\n"); return -EINVAL; } auth_data = kzalloc(sizeof(*auth_data) + req->auth_data_len + req->ie_len, GFP_KERNEL); if (!auth_data) return -ENOMEM; memcpy(auth_data->ap_addr, req->ap_mld_addr ?: req->bss->bssid, ETH_ALEN); auth_data->bss = req->bss; auth_data->link_id = req->link_id; if (req->auth_data_len >= 4) { if (req->auth_type == NL80211_AUTHTYPE_SAE) { __le16 *pos = (__le16 *) req->auth_data; auth_data->sae_trans = le16_to_cpu(pos[0]); auth_data->sae_status = le16_to_cpu(pos[1]); } memcpy(auth_data->data, req->auth_data + 4, req->auth_data_len - 4); auth_data->data_len += req->auth_data_len - 4; } /* Check if continuing authentication or trying to authenticate with the * same BSS that we were in the process of authenticating with and avoid * removal and re-addition of the STA entry in * ieee80211_prep_connection(). */ cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss && ifmgd->auth_data->link_id == req->link_id; if (req->ie && req->ie_len) { memcpy(&auth_data->data[auth_data->data_len], req->ie, req->ie_len); auth_data->data_len += req->ie_len; } if (req->key && req->key_len) { auth_data->key_len = req->key_len; auth_data->key_idx = req->key_idx; memcpy(auth_data->key, req->key, req->key_len); } ieee80211_parse_cfg_selectors(auth_data->userspace_selectors, req->supported_selectors, req->supported_selectors_len); auth_data->algorithm = auth_alg; /* try to authenticate/probe */ if (ifmgd->auth_data) { if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE) { auth_data->peer_confirmed = ifmgd->auth_data->peer_confirmed; } ieee80211_destroy_auth_data(sdata, cont_auth); } /* prep auth_data so we don't go into idle on disassoc */ ifmgd->auth_data = auth_data; /* If this is continuation of an ongoing SAE authentication exchange * (i.e., request to send SAE Confirm) and the peer has already * confirmed, mark authentication completed since we are about to send * out SAE Confirm. */ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE && auth_data->peer_confirmed && auth_data->sae_trans == 2) ieee80211_mark_sta_auth(sdata); if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; sdata_info(sdata, "disconnect from AP %pM for new auth to %pM\n", sdata->vif.cfg.ap_addr, auth_data->ap_addr); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_UNSPECIFIED, false); } /* needed for transmitting the auth frame(s) properly */ memcpy(sdata->vif.cfg.ap_addr, auth_data->ap_addr, ETH_ALEN); bss = (void *)req->bss->priv; wmm_used = bss->wmm_used && (local->hw.queues >= IEEE80211_NUM_ACS); sband = local->hw.wiphy->bands[req->bss->channel->band]; ieee80211_determine_our_sta_mode_auth(sdata, sband, req, wmm_used, &conn); err = ieee80211_prep_connection(sdata, req->bss, req->link_id, req->ap_mld_addr, cont_auth, &conn, false, auth_data->userspace_selectors); if (err) goto err_clear; if (req->link_id >= 0) link = sdata_dereference(sdata->link[req->link_id], sdata); else link = &sdata->deflink; if (WARN_ON(!link)) { err = -ENOLINK; goto err_clear; } sdata_info(sdata, "authenticate with %pM (local address=%pM)\n", auth_data->ap_addr, link->conf->addr); err = ieee80211_auth(sdata); if (err) { sta_info_destroy_addr(sdata, auth_data->ap_addr); goto err_clear; } /* hold our own reference */ cfg80211_ref_bss(local->hw.wiphy, auth_data->bss); return 0; err_clear: if (!ieee80211_vif_is_mld(&sdata->vif)) { eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); ieee80211_link_release_channel(&sdata->deflink); } ifmgd->auth_data = NULL; kfree(auth_data); return err; } static void ieee80211_setup_assoc_link(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, struct cfg80211_assoc_request *req, struct ieee80211_conn_settings *conn, unsigned int link_id) { struct ieee80211_local *local = sdata->local; const struct cfg80211_bss_ies *bss_ies; struct ieee80211_supported_band *sband; struct ieee80211_link_data *link; struct cfg80211_bss *cbss; struct ieee80211_bss *bss; cbss = assoc_data->link[link_id].bss; if (WARN_ON(!cbss)) return; bss = (void *)cbss->priv; sband = local->hw.wiphy->bands[cbss->channel->band]; if (WARN_ON(!sband)) return; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) return; /* for MLO connections assume advertising all rates is OK */ if (!req->ap_mld_addr) { assoc_data->supp_rates = bss->supp_rates; assoc_data->supp_rates_len = bss->supp_rates_len; } /* copy and link elems for the STA profile */ if (req->links[link_id].elems_len) { memcpy(assoc_data->ie_pos, req->links[link_id].elems, req->links[link_id].elems_len); assoc_data->link[link_id].elems = assoc_data->ie_pos; assoc_data->link[link_id].elems_len = req->links[link_id].elems_len; assoc_data->ie_pos += req->links[link_id].elems_len; } link->u.mgd.beacon_crc_valid = false; link->u.mgd.dtim_period = 0; link->u.mgd.have_beacon = false; /* override HT configuration only if the AP and we support it */ if (conn->mode >= IEEE80211_CONN_MODE_HT) { struct ieee80211_sta_ht_cap sta_ht_cap; memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); } rcu_read_lock(); bss_ies = rcu_dereference(cbss->beacon_ies); if (bss_ies) { u8 dtim_count = 0; ieee80211_get_dtim(bss_ies, &dtim_count, &link->u.mgd.dtim_period); sdata->deflink.u.mgd.have_beacon = true; if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { link->conf->sync_tsf = bss_ies->tsf; link->conf->sync_device_ts = bss->device_ts_beacon; link->conf->sync_dtim_count = dtim_count; } } else { bss_ies = rcu_dereference(cbss->ies); } if (bss_ies) { const struct element *elem; elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, bss_ies->data, bss_ies->len); if (elem && elem->datalen >= 3) link->conf->profile_periodicity = elem->data[2]; else link->conf->profile_periodicity = 0; elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, bss_ies->data, bss_ies->len); if (elem && elem->datalen >= 11 && (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) link->conf->ema_ap = true; else link->conf->ema_ap = false; } rcu_read_unlock(); if (bss->corrupt_data) { char *corrupt_type = "data"; if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_BEACON) { if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_PROBE_RESP) corrupt_type = "beacon and probe response"; else corrupt_type = "beacon"; } else if (bss->corrupt_data & IEEE80211_BSS_CORRUPT_PROBE_RESP) { corrupt_type = "probe response"; } sdata_info(sdata, "associating to AP %pM with corrupt %s\n", cbss->bssid, corrupt_type); } if (link->u.mgd.req_smps == IEEE80211_SMPS_AUTOMATIC) { if (sdata->u.mgd.powersave) link->smps_mode = IEEE80211_SMPS_DYNAMIC; else link->smps_mode = IEEE80211_SMPS_OFF; } else { link->smps_mode = link->u.mgd.req_smps; } } static int ieee80211_mgd_get_ap_ht_vht_capa(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *assoc_data, int link_id) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; enum nl80211_band band = cbss->channel->band; struct ieee80211_supported_band *sband; const struct element *elem; int err; /* neither HT nor VHT elements used on 6 GHz */ if (band == NL80211_BAND_6GHZ) return 0; if (assoc_data->link[link_id].conn.mode < IEEE80211_CONN_MODE_HT) return 0; rcu_read_lock(); elem = ieee80211_bss_get_elem(cbss, WLAN_EID_HT_OPERATION); if (!elem || elem->datalen < sizeof(struct ieee80211_ht_operation)) { mlme_link_id_dbg(sdata, link_id, "no HT operation on BSS %pM\n", cbss->bssid); err = -EINVAL; goto out_rcu; } assoc_data->link[link_id].ap_ht_param = ((struct ieee80211_ht_operation *)(elem->data))->ht_param; rcu_read_unlock(); if (assoc_data->link[link_id].conn.mode < IEEE80211_CONN_MODE_VHT) return 0; /* some drivers want to support VHT on 2.4 GHz even */ sband = sdata->local->hw.wiphy->bands[band]; if (!sband->vht_cap.vht_supported) return 0; rcu_read_lock(); elem = ieee80211_bss_get_elem(cbss, WLAN_EID_VHT_CAPABILITY); /* but even then accept it not being present on the AP */ if (!elem && band == NL80211_BAND_2GHZ) { err = 0; goto out_rcu; } if (!elem || elem->datalen < sizeof(struct ieee80211_vht_cap)) { mlme_link_id_dbg(sdata, link_id, "no VHT capa on BSS %pM\n", cbss->bssid); err = -EINVAL; goto out_rcu; } memcpy(&assoc_data->link[link_id].ap_vht_cap, elem->data, sizeof(struct ieee80211_vht_cap)); rcu_read_unlock(); return 0; out_rcu: rcu_read_unlock(); return err; } int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { unsigned int assoc_link_id = req->link_id < 0 ? 0 : req->link_id; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data; const struct element *ssid_elem; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_link_data *link; struct cfg80211_bss *cbss; bool override, uapsd_supported; bool match_auth; int i, err; size_t size = sizeof(*assoc_data) + req->ie_len; for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) size += req->links[i].elems_len; /* FIXME: no support for 4-addr MLO yet */ if (sdata->u.mgd.use_4addr && req->link_id >= 0) return -EOPNOTSUPP; assoc_data = kzalloc(size, GFP_KERNEL); if (!assoc_data) return -ENOMEM; cbss = req->link_id < 0 ? req->bss : req->links[req->link_id].bss; if (ieee80211_mgd_csa_in_process(sdata, cbss)) { sdata_info(sdata, "AP is in CSA process, reject assoc\n"); err = -EINVAL; goto err_free; } rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); if (!ssid_elem || ssid_elem->datalen > sizeof(assoc_data->ssid)) { rcu_read_unlock(); err = -EINVAL; goto err_free; } memcpy(assoc_data->ssid, ssid_elem->data, ssid_elem->datalen); assoc_data->ssid_len = ssid_elem->datalen; rcu_read_unlock(); if (req->ap_mld_addr) memcpy(assoc_data->ap_addr, req->ap_mld_addr, ETH_ALEN); else memcpy(assoc_data->ap_addr, cbss->bssid, ETH_ALEN); assoc_data->ext_mld_capa_ops = cpu_to_le16(req->ext_mld_capa_ops); if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; sdata_info(sdata, "disconnect from AP %pM for new assoc to %pM\n", sdata->vif.cfg.ap_addr, assoc_data->ap_addr); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_UNSPECIFIED, false, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_UNSPECIFIED, false); } memset(sdata->u.mgd.userspace_selectors, 0, sizeof(sdata->u.mgd.userspace_selectors)); ieee80211_parse_cfg_selectors(sdata->u.mgd.userspace_selectors, req->supported_selectors, req->supported_selectors_len); memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa)); memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask, sizeof(ifmgd->ht_capa_mask)); memcpy(&ifmgd->vht_capa, &req->vht_capa, sizeof(ifmgd->vht_capa)); memcpy(&ifmgd->vht_capa_mask, &req->vht_capa_mask, sizeof(ifmgd->vht_capa_mask)); memcpy(&ifmgd->s1g_capa, &req->s1g_capa, sizeof(ifmgd->s1g_capa)); memcpy(&ifmgd->s1g_capa_mask, &req->s1g_capa_mask, sizeof(ifmgd->s1g_capa_mask)); /* keep some setup (AP STA, channel, ...) if matching */ match_auth = ifmgd->auth_data && ether_addr_equal(ifmgd->auth_data->ap_addr, assoc_data->ap_addr) && ifmgd->auth_data->link_id == req->link_id; if (req->ap_mld_addr) { uapsd_supported = true; if (req->flags & (ASSOC_REQ_DISABLE_HT | ASSOC_REQ_DISABLE_VHT | ASSOC_REQ_DISABLE_HE | ASSOC_REQ_DISABLE_EHT)) { err = -EINVAL; goto err_free; } for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { struct ieee80211_supported_band *sband; struct cfg80211_bss *link_cbss = req->links[i].bss; struct ieee80211_bss *bss; if (!link_cbss) continue; bss = (void *)link_cbss->priv; if (!bss->wmm_used) { err = -EINVAL; req->links[i].error = err; goto err_free; } if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { err = -EINVAL; req->links[i].error = err; goto err_free; } link = sdata_dereference(sdata->link[i], sdata); if (link) ether_addr_copy(assoc_data->link[i].addr, link->conf->addr); else eth_random_addr(assoc_data->link[i].addr); sband = local->hw.wiphy->bands[link_cbss->channel->band]; if (match_auth && i == assoc_link_id && link) assoc_data->link[i].conn = link->u.mgd.conn; else assoc_data->link[i].conn = ieee80211_conn_settings_unlimited; ieee80211_determine_our_sta_mode_assoc(sdata, sband, req, true, i, &assoc_data->link[i].conn); assoc_data->link[i].bss = link_cbss; assoc_data->link[i].disabled = req->links[i].disabled; if (!bss->uapsd_supported) uapsd_supported = false; if (assoc_data->link[i].conn.mode < IEEE80211_CONN_MODE_EHT) { err = -EINVAL; req->links[i].error = err; goto err_free; } err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, assoc_data, i); if (err) { err = -EINVAL; req->links[i].error = err; goto err_free; } } assoc_data->wmm = true; } else { struct ieee80211_supported_band *sband; struct ieee80211_bss *bss = (void *)cbss->priv; memcpy(assoc_data->link[0].addr, sdata->vif.addr, ETH_ALEN); assoc_data->s1g = cbss->channel->band == NL80211_BAND_S1GHZ; assoc_data->wmm = bss->wmm_used && (local->hw.queues >= IEEE80211_NUM_ACS); if (cbss->channel->band == NL80211_BAND_6GHZ && req->flags & (ASSOC_REQ_DISABLE_HT | ASSOC_REQ_DISABLE_VHT | ASSOC_REQ_DISABLE_HE)) { err = -EINVAL; goto err_free; } sband = local->hw.wiphy->bands[cbss->channel->band]; assoc_data->link[0].bss = cbss; if (match_auth) assoc_data->link[0].conn = sdata->deflink.u.mgd.conn; else assoc_data->link[0].conn = ieee80211_conn_settings_unlimited; ieee80211_determine_our_sta_mode_assoc(sdata, sband, req, assoc_data->wmm, 0, &assoc_data->link[0].conn); uapsd_supported = bss->uapsd_supported; err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, assoc_data, 0); if (err) goto err_free; } assoc_data->spp_amsdu = req->flags & ASSOC_REQ_SPP_AMSDU; if (ifmgd->auth_data && !ifmgd->auth_data->done) { err = -EBUSY; goto err_free; } if (ifmgd->assoc_data) { err = -EBUSY; goto err_free; } /* Cleanup is delayed if auth_data matches */ if (ifmgd->auth_data && !match_auth) ieee80211_destroy_auth_data(sdata, false); if (req->ie && req->ie_len) { memcpy(assoc_data->ie, req->ie, req->ie_len); assoc_data->ie_len = req->ie_len; assoc_data->ie_pos = assoc_data->ie + assoc_data->ie_len; } else { assoc_data->ie_pos = assoc_data->ie; } if (req->fils_kek) { /* should already be checked in cfg80211 - so warn */ if (WARN_ON(req->fils_kek_len > FILS_MAX_KEK_LEN)) { err = -EINVAL; goto err_free; } memcpy(assoc_data->fils_kek, req->fils_kek, req->fils_kek_len); assoc_data->fils_kek_len = req->fils_kek_len; } if (req->fils_nonces) memcpy(assoc_data->fils_nonces, req->fils_nonces, 2 * FILS_NONCE_LEN); /* default timeout */ assoc_data->timeout = jiffies; assoc_data->timeout_started = true; assoc_data->assoc_link_id = assoc_link_id; if (req->ap_mld_addr) { /* if there was no authentication, set up the link */ err = ieee80211_vif_set_links(sdata, BIT(assoc_link_id), 0); if (err) goto err_clear; } link = sdata_dereference(sdata->link[assoc_link_id], sdata); if (WARN_ON(!link)) { err = -EINVAL; goto err_clear; } override = link->u.mgd.conn.mode != assoc_data->link[assoc_link_id].conn.mode || link->u.mgd.conn.bw_limit != assoc_data->link[assoc_link_id].conn.bw_limit; link->u.mgd.conn = assoc_data->link[assoc_link_id].conn; ieee80211_setup_assoc_link(sdata, assoc_data, req, &link->u.mgd.conn, assoc_link_id); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK), "U-APSD not supported with HW_PS_NULLFUNC_STACK\n")) sdata->vif.driver_flags &= ~IEEE80211_VIF_SUPPORTS_UAPSD; if (assoc_data->wmm && uapsd_supported && (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD)) { assoc_data->uapsd = true; ifmgd->flags |= IEEE80211_STA_UAPSD_ENABLED; } else { assoc_data->uapsd = false; ifmgd->flags &= ~IEEE80211_STA_UAPSD_ENABLED; } if (req->prev_bssid) memcpy(assoc_data->prev_ap_addr, req->prev_bssid, ETH_ALEN); if (req->use_mfp) { ifmgd->mfp = IEEE80211_MFP_REQUIRED; ifmgd->flags |= IEEE80211_STA_MFP_ENABLED; } else { ifmgd->mfp = IEEE80211_MFP_DISABLED; ifmgd->flags &= ~IEEE80211_STA_MFP_ENABLED; } if (req->flags & ASSOC_REQ_USE_RRM) ifmgd->flags |= IEEE80211_STA_ENABLE_RRM; else ifmgd->flags &= ~IEEE80211_STA_ENABLE_RRM; if (req->crypto.control_port) ifmgd->flags |= IEEE80211_STA_CONTROL_PORT; else ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT; sdata->control_port_protocol = req->crypto.control_port_ethertype; sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = req->crypto.control_port_over_nl80211; sdata->control_port_no_preauth = req->crypto.control_port_no_preauth; /* kick off associate process */ ifmgd->assoc_data = assoc_data; for (i = 0; i < ARRAY_SIZE(assoc_data->link); i++) { if (!assoc_data->link[i].bss) continue; if (i == assoc_data->assoc_link_id) continue; /* only calculate the mode, hence link == NULL */ err = ieee80211_prep_channel(sdata, NULL, i, assoc_data->link[i].bss, true, &assoc_data->link[i].conn, sdata->u.mgd.userspace_selectors); if (err) { req->links[i].error = err; goto err_clear; } } memcpy(vif_cfg->ssid, assoc_data->ssid, assoc_data->ssid_len); vif_cfg->ssid_len = assoc_data->ssid_len; /* needed for transmitting the assoc frames properly */ memcpy(sdata->vif.cfg.ap_addr, assoc_data->ap_addr, ETH_ALEN); err = ieee80211_prep_connection(sdata, cbss, req->link_id, req->ap_mld_addr, true, &assoc_data->link[assoc_link_id].conn, override, sdata->u.mgd.userspace_selectors); if (err) goto err_clear; if (ieee80211_hw_check(&sdata->local->hw, NEED_DTIM_BEFORE_ASSOC)) { const struct cfg80211_bss_ies *beacon_ies; rcu_read_lock(); beacon_ies = rcu_dereference(req->bss->beacon_ies); if (!beacon_ies) { /* * Wait up to one beacon interval ... * should this be more if we miss one? */ sdata_info(sdata, "waiting for beacon from %pM\n", link->u.mgd.bssid); assoc_data->timeout = TU_TO_EXP_TIME(req->bss->beacon_interval); assoc_data->timeout_started = true; assoc_data->need_beacon = true; } rcu_read_unlock(); } run_again(sdata, assoc_data->timeout); /* We are associating, clean up auth_data */ if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, true); return 0; err_clear: if (!ifmgd->auth_data) { eth_zero_addr(sdata->deflink.u.mgd.bssid); ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); } ifmgd->assoc_data = NULL; err_free: kfree(assoc_data); return err; } int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct cfg80211_deauth_request *req) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; bool tx = !req->local_state_change; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_DEAUTH, }; if (ifmgd->auth_data && ether_addr_equal(ifmgd->auth_data->ap_addr, req->bssid)) { sdata_info(sdata, "aborting authentication with %pM by local choice (Reason: %u=%s)\n", req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); info.link_id = ifmgd->auth_data->link_id; drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_destroy_auth_data(sdata, false); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } if (ifmgd->assoc_data && ether_addr_equal(ifmgd->assoc_data->ap_addr, req->bssid)) { sdata_info(sdata, "aborting association with %pM by local choice (Reason: %u=%s)\n", req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); info.link_id = ifmgd->assoc_data->assoc_link_id; drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, req->bssid, req->bssid, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_destroy_assoc_data(sdata, ASSOC_ABANDON); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } if (ifmgd->associated && ether_addr_equal(sdata->vif.cfg.ap_addr, req->bssid)) { sdata_info(sdata, "deauthenticating from %pM by local choice (Reason: %u=%s)\n", req->bssid, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); return 0; } return -ENOTCONN; } int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_disassoc_request *req) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; if (!sdata->u.mgd.associated || memcmp(sdata->vif.cfg.ap_addr, req->ap_addr, ETH_ALEN)) return -ENOTCONN; sdata_info(sdata, "disassociating from %pM by local choice (Reason: %u=%s)\n", req->ap_addr, req->reason_code, ieee80211_get_reason_code_string(req->reason_code)); ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DISASSOC, req->reason_code, !req->local_state_change, frame_buf); ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); return 0; } void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) { wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.request_smps_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work); } void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* * Make sure some work items will not run after this, * they will not do anything but might not have been * cancelled when disconnecting. */ wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->monitor_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->beacon_connection_loss_work); wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->csa_connection_drop_work); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->tdls_peer_del_work); if (ifmgd->assoc_data) ieee80211_destroy_assoc_data(sdata, ASSOC_TIMEOUT); if (ifmgd->auth_data) ieee80211_destroy_auth_data(sdata, false); spin_lock_bh(&ifmgd->teardown_lock); if (ifmgd->teardown_skb) { kfree_skb(ifmgd->teardown_skb); ifmgd->teardown_skb = NULL; ifmgd->orig_teardown_skb = NULL; } kfree(ifmgd->assoc_req_ies); ifmgd->assoc_req_ies = NULL; ifmgd->assoc_req_ies_len = 0; spin_unlock_bh(&ifmgd->teardown_lock); timer_delete_sync(&ifmgd->timer); } void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, enum nl80211_cqm_rssi_threshold_event rssi_event, s32 rssi_level, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_cqm_rssi_notify(sdata, rssi_event, rssi_level); cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, rssi_level, gfp); } EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_cqm_beacon_loss_notify(sdata->local, sdata); cfg80211_cqm_beacon_loss_notify(sdata->dev, gfp); } EXPORT_SYMBOL(ieee80211_cqm_beacon_loss_notify); static void _ieee80211_enable_rssi_reports(struct ieee80211_sub_if_data *sdata, int rssi_min_thold, int rssi_max_thold) { trace_api_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold); if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) return; /* * Scale up threshold values before storing it, as the RSSI averaging * algorithm uses a scaled up value as well. Change this scaling * factor if the RSSI averaging algorithm changes. */ sdata->u.mgd.rssi_min_thold = rssi_min_thold*16; sdata->u.mgd.rssi_max_thold = rssi_max_thold*16; } void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); WARN_ON(rssi_min_thold == rssi_max_thold || rssi_min_thold > rssi_max_thold); _ieee80211_enable_rssi_reports(sdata, rssi_min_thold, rssi_max_thold); } EXPORT_SYMBOL(ieee80211_enable_rssi_reports); void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); _ieee80211_enable_rssi_reports(sdata, 0, 0); } EXPORT_SYMBOL(ieee80211_disable_rssi_reports); void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *add_links_data = ifmgd->reconf.add_links_data; struct sta_info *sta; struct cfg80211_mlo_reconf_done_data done_data = {}; u16 sta_changed_links = sdata->u.mgd.reconf.added_links | sdata->u.mgd.reconf.removed_links; u16 link_mask, valid_links; unsigned int link_id; size_t orig_len = len; u8 i, group_key_data_len; u8 *pos; if (!ieee80211_vif_is_mld(&sdata->vif) || len < offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp) || mgmt->u.action.u.ml_reconf_resp.dialog_token != sdata->u.mgd.reconf.dialog_token || !sta_changed_links) return; pos = mgmt->u.action.u.ml_reconf_resp.variable; len -= offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp); /* each status duple is 3 octets */ if (len < mgmt->u.action.u.ml_reconf_resp.count * 3) { sdata_info(sdata, "mlo: reconf: unexpected len=%zu, count=%u\n", len, mgmt->u.action.u.ml_reconf_resp.count); goto disconnect; } link_mask = sta_changed_links; for (i = 0; i < mgmt->u.action.u.ml_reconf_resp.count; i++) { u16 status = get_unaligned_le16(pos + 1); link_id = *pos; if (!(link_mask & BIT(link_id))) { sdata_info(sdata, "mlo: reconf: unexpected link: %u, changed=0x%x\n", link_id, sta_changed_links); goto disconnect; } /* clear the corresponding link, to detect the case that * the same link was included more than one time */ link_mask &= ~BIT(link_id); /* Handle failure to remove links here. Failure to remove added * links will be done later in the flow. */ if (status != WLAN_STATUS_SUCCESS) { sdata_info(sdata, "mlo: reconf: failed on link=%u, status=%u\n", link_id, status); /* The AP MLD failed to remove a link that was already * removed locally. As this is not expected behavior, * disconnect */ if (sdata->u.mgd.reconf.removed_links & BIT(link_id)) goto disconnect; /* The AP MLD failed to add a link. Remove it from the * added links. */ sdata->u.mgd.reconf.added_links &= ~BIT(link_id); } pos += 3; len -= 3; } if (link_mask) { sdata_info(sdata, "mlo: reconf: no response for links=0x%x\n", link_mask); goto disconnect; } if (!sdata->u.mgd.reconf.added_links) goto out; if (len < 1 || len < 1 + *pos) { sdata_info(sdata, "mlo: reconf: invalid group key data length"); goto disconnect; } /* The Group Key Data field must be present when links are added. This * field should be processed by userland. */ group_key_data_len = *pos++; pos += group_key_data_len; len -= group_key_data_len + 1; /* Process the information for the added links */ sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) goto disconnect; valid_links = sdata->vif.valid_links; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!add_links_data->link[link_id].bss || !(sdata->u.mgd.reconf.added_links & BIT(link_id))) continue; valid_links |= BIT(link_id); if (ieee80211_sta_allocate_link(sta, link_id)) goto disconnect; } ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links); link_mask = 0; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = add_links_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; u64 changed = 0; if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); if (WARN_ON(!link)) goto disconnect; link_info(link, "mlo: reconf: local address %pM, AP link address %pM\n", add_links_data->link[link_id].addr, add_links_data->link[link_id].bss->bssid); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); if (WARN_ON(!link_sta)) goto disconnect; if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); ies = rcu_dereference(cbss->beacon_ies); if (ies) link->u.mgd.have_beacon = true; else ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); } link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; link->u.mgd.conn = add_links_data->link[link_id].conn; if (ieee80211_prep_channel(sdata, link, link_id, cbss, true, &link->u.mgd.conn, sdata->u.mgd.userspace_selectors)) { link_info(link, "mlo: reconf: prep_channel failed\n"); goto disconnect; } if (ieee80211_mgd_setup_link_sta(link, sta, link_sta, add_links_data->link[link_id].bss)) goto disconnect; if (!ieee80211_assoc_config_link(link, link_sta, add_links_data->link[link_id].bss, mgmt, pos, len, &changed)) goto disconnect; /* The AP MLD indicated success for this link, but the station * profile status indicated otherwise. Since there is an * inconsistency in the ML reconfiguration response, disconnect */ if (add_links_data->link[link_id].status != WLAN_STATUS_SUCCESS) goto disconnect; ieee80211_sta_init_nss(link_sta); if (ieee80211_sta_activate_link(sta, link_id)) goto disconnect; changed |= ieee80211_link_set_associated(link, cbss); ieee80211_link_info_change_notify(sdata, link, changed); ieee80211_recalc_smps(sdata, link); link_mask |= BIT(link_id); } sdata_info(sdata, "mlo: reconf: current valid_links=0x%x, added=0x%x\n", valid_links, link_mask); /* links might have changed due to rejected ones, set them again */ ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links); ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); done_data.buf = (const u8 *)mgmt; done_data.len = orig_len; done_data.added_links = link_mask; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { done_data.links[link_id].bss = add_links_data->link[link_id].bss; done_data.links[link_id].addr = add_links_data->link[link_id].addr; } cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); kfree(sdata->u.mgd.reconf.add_links_data); sdata->u.mgd.reconf.add_links_data = NULL; out: ieee80211_ml_reconf_reset(sdata); return; disconnect: __ieee80211_disconnect(sdata); } static struct sk_buff * ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgd_assoc_data *add_links_data, u16 removed_links, __le16 ext_mld_capa_ops) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct ieee80211_multi_link_elem *ml_elem; struct ieee80211_mle_basic_common_info *common; enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); struct sk_buff *skb; size_t size; unsigned int link_id; __le16 eml_capa = 0, mld_capa_ops = 0; struct ieee80211_tx_info *info; u8 common_size, var_common_size; u8 *ml_elem_len; u16 capab = 0; size = local->hw.extra_tx_headroom + sizeof(*mgmt); /* Consider the maximal length of the reconfiguration ML element */ size += sizeof(struct ieee80211_multi_link_elem); /* The Basic ML element and the Reconfiguration ML element have the same * fixed common information fields in the context of ML reconfiguration * action frame. The AP MLD MAC address must always be present */ common_size = sizeof(*common); /* when adding links, the MLD capabilities must be present */ var_common_size = 0; if (add_links_data) { const struct wiphy_iftype_ext_capab *ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, ieee80211_vif_type_p2p(&sdata->vif)); if (ift_ext_capa) { eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities); mld_capa_ops = cpu_to_le16(ift_ext_capa->mld_capa_and_ops); } /* MLD capabilities and operation */ var_common_size += 2; /* EML capabilities */ if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) var_common_size += 2; } if (ext_mld_capa_ops) var_common_size += 2; /* Add the common information length */ size += common_size + var_common_size; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss; size_t elems_len; if (removed_links & BIT(link_id)) { size += sizeof(struct ieee80211_mle_per_sta_profile) + ETH_ALEN; continue; } if (!add_links_data || !add_links_data->link[link_id].bss) continue; elems_len = add_links_data->link[link_id].elems_len; cbss = add_links_data->link[link_id].bss; /* should be the same across all BSSes */ if (cbss->capability & WLAN_CAPABILITY_PRIVACY) capab |= WLAN_CAPABILITY_PRIVACY; size += 2 + sizeof(struct ieee80211_mle_per_sta_profile) + ETH_ALEN; /* WMM */ size += 9; size += ieee80211_link_common_elems_size(sdata, iftype, cbss, elems_len); } skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, offsetofend(struct ieee80211_mgmt, u.action.u.ml_reconf_req)); /* Add the MAC header */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); /* Add the action frame fixed fields */ mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; mgmt->u.action.u.ml_reconf_req.action_code = WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; /* allocate a dialog token and store it */ sdata->u.mgd.reconf.dialog_token = ++sdata->u.mgd.dialog_token_alloc; mgmt->u.action.u.ml_reconf_req.dialog_token = sdata->u.mgd.reconf.dialog_token; /* Add the ML reconfiguration element and the common information */ skb_put_u8(skb, WLAN_EID_EXTENSION); ml_elem_len = skb_put(skb, 1); skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK); ml_elem = skb_put(skb, sizeof(*ml_elem)); ml_elem->control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_RECONF | IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR); common = skb_put(skb, common_size); common->len = common_size + var_common_size; memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN); if (add_links_data) { if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | IEEE80211_EML_CAP_EMLMR_SUPPORT))) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EML_CAPA); skb_put_data(skb, &eml_capa, sizeof(eml_capa)); } ml_elem->control |= cpu_to_le16(IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP); skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); } if (ext_mld_capa_ops) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP); skb_put_data(skb, &ext_mld_capa_ops, sizeof(ext_mld_capa_ops)); } if (sdata->u.mgd.flags & IEEE80211_STA_ENABLE_RRM) capab |= WLAN_CAPABILITY_RADIO_MEASURE; /* Add the per station profile */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { u8 *subelem_len = NULL; u16 ctrl; const u8 *addr; /* Skip links that are not changing */ if (!(removed_links & BIT(link_id)) && (!add_links_data || !add_links_data->link[link_id].bss)) continue; ctrl = link_id | IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT; if (removed_links & BIT(link_id)) { struct ieee80211_bss_conf *conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); if (!conf) continue; addr = conf->addr; ctrl |= u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK, IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE); } else { addr = add_links_data->link[link_id].addr; ctrl |= IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE | u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK, IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE); } skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); subelem_len = skb_put(skb, 1); put_unaligned_le16(ctrl, skb_put(skb, sizeof(ctrl))); skb_put_u8(skb, 1 + ETH_ALEN); skb_put_data(skb, addr, ETH_ALEN); if (!(removed_links & BIT(link_id))) { u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; size_t extra_used; void *capab_pos; u8 qos_info; capab_pos = skb_put(skb, 2); extra_used = ieee80211_add_link_elems(sdata, skb, &capab, NULL, add_links_data->link[link_id].elems, add_links_data->link[link_id].elems_len, link_id, NULL, link_present_elems, add_links_data); if (add_links_data->link[link_id].elems) skb_put_data(skb, add_links_data->link[link_id].elems + extra_used, add_links_data->link[link_id].elems_len - extra_used); if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED) { qos_info = sdata->u.mgd.uapsd_queues; qos_info |= (sdata->u.mgd.uapsd_max_sp_len << IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); } else { qos_info = 0; } ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); put_unaligned_le16(capab, capab_pos); } ieee80211_fragment_element(skb, subelem_len, IEEE80211_MLE_SUBELEM_FRAGMENT); } ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; return skb; } int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, struct cfg80211_ml_reconf_req *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_assoc_data *data = NULL; struct sta_info *sta; struct sk_buff *skb; u16 added_links, new_valid_links; int link_id, err; if (!ieee80211_vif_is_mld(&sdata->vif) || !(sdata->vif.cfg.mld_capa_op & IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT)) return -EINVAL; /* No support for concurrent ML reconfiguration operation */ if (sdata->u.mgd.reconf.added_links || sdata->u.mgd.reconf.removed_links) return -EBUSY; added_links = 0; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!req->add_links[link_id].bss) continue; added_links |= BIT(link_id); } sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (WARN_ON(!sta)) return -ENOLINK; /* Adding links to the set of valid link is done only after a successful * ML reconfiguration frame exchange. Here prepare the data for the ML * reconfiguration frame construction and allocate the required * resources */ if (added_links) { bool uapsd_supported; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->assoc_link_id = -1; data->wmm = true; uapsd_supported = true; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct ieee80211_supported_band *sband; struct cfg80211_bss *link_cbss = req->add_links[link_id].bss; struct ieee80211_bss *bss; if (!link_cbss) continue; bss = (void *)link_cbss->priv; if (!bss->wmm_used) { err = -EINVAL; goto err_free; } if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { err = -EINVAL; goto err_free; } eth_random_addr(data->link[link_id].addr); data->link[link_id].conn = ieee80211_conn_settings_unlimited; sband = local->hw.wiphy->bands[link_cbss->channel->band]; ieee80211_determine_our_sta_mode(sdata, sband, NULL, true, link_id, &data->link[link_id].conn); data->link[link_id].bss = link_cbss; data->link[link_id].disabled = req->add_links[link_id].disabled; data->link[link_id].elems = (u8 *)req->add_links[link_id].elems; data->link[link_id].elems_len = req->add_links[link_id].elems_len; if (!bss->uapsd_supported) uapsd_supported = false; if (data->link[link_id].conn.mode < IEEE80211_CONN_MODE_EHT) { err = -EINVAL; goto err_free; } err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, data, link_id); if (err) { err = -EINVAL; goto err_free; } } /* Require U-APSD support if we enabled it */ if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED && !uapsd_supported) { err = -EINVAL; sdata_info(sdata, "U-APSD on but not available on (all) new links\n"); goto err_free; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!data->link[link_id].bss) continue; /* only used to verify the mode, nothing is allocated */ err = ieee80211_prep_channel(sdata, NULL, link_id, data->link[link_id].bss, true, &data->link[link_id].conn, sdata->u.mgd.userspace_selectors); if (err) goto err_free; } } /* link removal is done before the ML reconfiguration frame exchange so * that these links will not be used between their removal by the AP MLD * and before the station got the ML reconfiguration response. Based on * Section 35.3.6.4 in Draft P802.11be_D7.0 the AP MLD should accept the * link removal request. */ if (req->rem_links) { u16 new_active_links = sdata->vif.active_links & ~req->rem_links; new_valid_links = sdata->vif.valid_links & ~req->rem_links; /* Should not be left with no valid links to perform the * ML reconfiguration */ if (!new_valid_links || !(new_valid_links & ~sdata->vif.dormant_links)) { sdata_info(sdata, "mlo: reconf: no valid links\n"); err = -EINVAL; goto err_free; } if (new_active_links != sdata->vif.active_links) { if (!new_active_links) new_active_links = BIT(__ffs(new_valid_links & ~sdata->vif.dormant_links)); err = ieee80211_set_active_links(&sdata->vif, new_active_links); if (err) { sdata_info(sdata, "mlo: reconf: failed set active links\n"); goto err_free; } } } /* Build the SKB before the link removal as the construction of the * station info for removed links requires the local address. * Invalidate the removed links, so that the transmission of the ML * reconfiguration request frame would not be done using them, as the AP * is expected to send the ML reconfiguration response frame on the link * on which the request was received. */ skb = ieee80211_build_ml_reconf_req(sdata, data, req->rem_links, cpu_to_le16(req->ext_mld_capa_ops)); if (!skb) { err = -ENOMEM; goto err_free; } if (req->rem_links) { u16 new_dormant_links = sdata->vif.dormant_links & ~req->rem_links; err = ieee80211_vif_set_links(sdata, new_valid_links, new_dormant_links); if (err) { sdata_info(sdata, "mlo: reconf: failed set valid links\n"); kfree_skb(skb); goto err_free; } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!(req->rem_links & BIT(link_id))) continue; ieee80211_sta_remove_link(sta, link_id); } /* notify the driver and upper layers */ ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); cfg80211_links_removed(sdata->dev, req->rem_links); } sdata_info(sdata, "mlo: reconf: adding=0x%x, removed=0x%x\n", added_links, req->rem_links); ieee80211_tx_skb(sdata, skb); sdata->u.mgd.reconf.added_links = added_links; sdata->u.mgd.reconf.add_links_data = data; sdata->u.mgd.reconf.removed_links = req->rem_links; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.reconf.wk, IEEE80211_ASSOC_TIMEOUT_SHORT); return 0; err_free: kfree(data); return err; } static bool ieee80211_mgd_epcs_supp(struct ieee80211_sub_if_data *sdata) { unsigned long valid_links = sdata->vif.valid_links; u8 link_id; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ieee80211_vif_is_mld(&sdata->vif)) return false; for_each_set_bit(link_id, &valid_links, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_bss_conf *bss_conf = sdata_dereference(sdata->vif.link_conf[link_id], sdata); if (WARN_ON(!bss_conf) || !bss_conf->epcs_support) return false; } return true; } int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) { struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; int frame_len = offsetofend(struct ieee80211_mgmt, u.action.u.epcs) + (enable ? 1 : 0); if (!ieee80211_mgd_epcs_supp(sdata)) return -EINVAL; if (sdata->u.mgd.epcs.enabled == enable && !sdata->u.mgd.epcs.dialog_token) return 0; /* Do not allow enabling EPCS if the AP didn't respond yet. * However, allow disabling EPCS in such a case. */ if (sdata->u.mgd.epcs.dialog_token && enable) return -EALREADY; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); if (!skb) return -ENOBUFS; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = skb_put_zero(skb, frame_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; if (enable) { u8 *pos = mgmt->u.action.u.epcs.variable; mgmt->u.action.u.epcs.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ; *pos = ++sdata->u.mgd.dialog_token_alloc; sdata->u.mgd.epcs.dialog_token = *pos; } else { mgmt->u.action.u.epcs.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN; ieee80211_epcs_teardown(sdata); ieee80211_epcs_changed(sdata, false); } ieee80211_tx_skb(sdata, skb); return 0; } static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems) { const struct element *sub; size_t scratch_len = elems->ml_epcs_len; u8 *scratch __free(kfree) = kzalloc(scratch_len, GFP_KERNEL); lockdep_assert_wiphy(sdata->local->hw.wiphy); if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_epcs) return; if (WARN_ON(!scratch)) return; /* Directly parse the sub elements as the common information doesn't * hold any useful information. */ for_each_mle_subelement(sub, (const u8 *)elems->ml_epcs, elems->ml_epcs_len) { struct ieee80211_link_data *link; struct ieee802_11_elems *link_elems __free(kfree); u8 *pos = (void *)sub->data; u16 control; ssize_t len; u8 link_id; if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) continue; if (sub->datalen < sizeof(control)) break; control = get_unaligned_le16(pos); link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID; link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; len = cfg80211_defragment_element(sub, (u8 *)elems->ml_epcs, elems->ml_epcs_len, scratch, scratch_len, IEEE80211_MLE_SUBELEM_FRAGMENT); if (len < (ssize_t)sizeof(control)) continue; pos = scratch + sizeof(control); len -= sizeof(control); link_elems = ieee802_11_parse_elems(pos, len, false, NULL); if (!link_elems) continue; if (ieee80211_sta_wmm_params(sdata->local, link, link_elems->wmm_param, link_elems->wmm_param_len, link_elems->mu_edca_param_set)) ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); } } void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee802_11_elems *elems __free(kfree) = NULL; size_t ies_len; u16 status_code; u8 *pos, dialog_token; if (!ieee80211_mgd_epcs_supp(sdata)) return; /* Handle dialog token and status code */ pos = mgmt->u.action.u.epcs.variable; dialog_token = *pos; status_code = get_unaligned_le16(pos + 1); /* An EPCS enable response with dialog token == 0 is an unsolicited * notification from the AP MLD. In such a case, EPCS should already be * enabled and status must be success */ if (!dialog_token && (!sdata->u.mgd.epcs.enabled || status_code != WLAN_STATUS_SUCCESS)) return; if (sdata->u.mgd.epcs.dialog_token != dialog_token) return; sdata->u.mgd.epcs.dialog_token = 0; if (status_code != WLAN_STATUS_SUCCESS) return; pos += IEEE80211_EPCS_ENA_RESP_BODY_LEN; ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.epcs.variable) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); if (!elems) return; ieee80211_ml_epcs(sdata, elems); ieee80211_epcs_changed(sdata, true); } void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || !sdata->u.mgd.epcs.enabled) return; ieee80211_epcs_teardown(sdata); ieee80211_epcs_changed(sdata, false); } |
| 2475 2476 1197 4 4 4 1531 7 7 1198 1197 4 1197 1198 1530 1530 6 1531 1532 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. */ #include "vma_internal.h" #include "vma.h" /* SLAB cache for vm_area_struct structures */ static struct kmem_cache *vm_area_cachep; void __init vma_state_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), }; vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), &args, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT); } struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return NULL; vma_init(vma, mm); return vma; } static void vm_area_init_from(const struct vm_area_struct *src, struct vm_area_struct *dest) { dest->vm_mm = src->vm_mm; dest->vm_ops = src->vm_ops; dest->vm_start = src->vm_start; dest->vm_end = src->vm_end; dest->anon_vma = src->anon_vma; dest->vm_pgoff = src->vm_pgoff; dest->vm_file = src->vm_file; dest->vm_private_data = src->vm_private_data; vm_flags_init(dest, src->vm_flags); memcpy(&dest->vm_page_prot, &src->vm_page_prot, sizeof(dest->vm_page_prot)); /* * src->shared.rb may be modified concurrently when called from * dup_mmap(), but the clone will reinitialize it. */ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, sizeof(dest->vm_userfaultfd_ctx)); #ifdef CONFIG_ANON_VMA_NAME dest->anon_name = src->anon_name; #endif #ifdef CONFIG_SWAP memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, sizeof(dest->swap_readahead_info)); #endif #ifndef CONFIG_MMU dest->vm_region = src->vm_region; #endif #ifdef CONFIG_NUMA dest->vm_policy = src->vm_policy; #endif #ifdef __HAVE_PFNMAP_TRACKING dest->pfnmap_track_ctx = NULL; #endif } #ifdef __HAVE_PFNMAP_TRACKING static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, struct vm_area_struct *new) { struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx; if (likely(!ctx)) return 0; /* * We don't expect to ever hit this. If ever required, we would have * to duplicate the tracking. */ if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX)) return -ENOMEM; kref_get(&ctx->kref); new->pfnmap_track_ctx = ctx; return 0; } static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) { struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx; if (likely(!ctx)) return; kref_put(&ctx->kref, pfnmap_track_ctx_release); vma->pfnmap_track_ctx = NULL; } #else static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig, struct vm_area_struct *new) { return 0; } static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma) { } #endif struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return NULL; ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); vm_area_init_from(orig, new); if (vma_pfnmap_track_ctx_dup(orig, new)) { kmem_cache_free(vm_area_cachep, new); return NULL; } vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); return new; } void vm_area_free(struct vm_area_struct *vma) { /* The vma should be detached while being destroyed. */ vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); vma_pfnmap_track_ctx_release(vma); kmem_cache_free(vm_area_cachep, vma); } |
| 16 16 15 16 11359 11346 2841 11308 542 543 543 542 1436 1433 1433 1434 1434 1437 1430 1434 516 516 514 514 513 514 514 514 117 117 116 98 98 98 98 98 98 60 60 60 60 60 98 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * Add to a counter while respecting batch size. * * There are 2 implementations, both dealing with the following problem: * * The decision slow path/fast path and the actual update must be atomic. * Otherwise a call in process context could check the current values and * decide that the fast path can be used. If now an interrupt occurs before * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), * then the this_cpu_add() that is executed after the interrupt has completed * can produce values larger than "batch" or even overflows. */ #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * Safety against interrupts is achieved in 2 ways: * 1. the fast path uses local cmpxchg (note: no lock prefix) * 2. the slow path operates with interrupts disabled */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; count = this_cpu_read(*fbc->counters); do { if (unlikely(abs(count + amount) >= batch)) { raw_spin_lock_irqsave(&fbc->lock, flags); /* * Note: by now we might have migrated to another CPU * or the value might have changed. */ count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); return; } } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount)); } #else /* * local_irq_save() is used to make the function irq safe: * - The slow path would be ok as protected by an irq-safe spinlock. * - this_cpu_add would be ok as it is irq-safe by definition. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } local_irq_restore(flags); } #endif EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). * * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums * from CPUs that are in the process of being taken offline. Dying cpus have * been removed from the online mask, but may not have had the hotplug dead * notifier called to fold the percpu count back into the global counter sum. * By including dying CPUs in the iteration mask, we avoid this race condition * so __percpu_counter_sum() just does the right thing when CPUs are being taken * offline. */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key) { unsigned long flags __maybe_unused; size_t counter_size; s32 __percpu *counters; u32 i; counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); counters = __alloc_percpu_gfp(nr_counters * counter_size, __alignof__(*counters), gfp); if (!counters) { fbc[0].counters = NULL; return -ENOMEM; } for (i = 0; i < nr_counters; i++) { raw_spin_lock_init(&fbc[i].lock); lockdep_set_class(&fbc[i].lock, key); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init_many); void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; u32 i; if (WARN_ON_ONCE(!fbc)) return; if (!fbc[0].counters) return; for (i = 0; i < nr_counters; i++) debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc[0].counters); for (i = 0; i < nr_counters; i++) fbc[i].counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); /* * Compare counter, and add amount if total is: less than or equal to limit if * amount is positive, or greater than or equal to limit if amount is negative. * Return true if amount is added, or false if total would be beyond the limit. * * Negative limit is allowed, but unusual. * When negative amounts (subs) are given to percpu_counter_limited_add(), * the limit would most naturally be 0 - but other limits are also allowed. * * Overflow beyond S64_MAX is not allowed for: counter, limit and amount * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) { s64 count; s64 unknown; unsigned long flags; bool good = false; if (amount == 0) return true; local_irq_save(flags); unknown = batch * num_online_cpus(); count = __this_cpu_read(*fbc->counters); /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && ((amount > 0 && fbc->count + unknown <= limit) || (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; } raw_spin_lock(&fbc->lock); count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ if (amount > 0) { if (count - unknown > limit) goto out; if (count + unknown <= limit) good = true; } else { if (count + unknown < limit) goto out; if (count - unknown >= limit) good = true; } if (!good) { s32 *pcount; int cpu; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } if (amount > 0) { if (count > limit) goto out; } else { if (count < limit) goto out; } good = true; } count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; } static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup); |
| 1 1 1 1 1 1 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 | // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" #include <linux/hashtable.h> #include <linux/seq_file.h> /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; /* list for storing orangefs specific superblocks in use */ LIST_HEAD(orangefs_superblocks); DEFINE_SPINLOCK(orangefs_superblocks_lock); enum { Opt_acl, Opt_intr, Opt_local_lock, }; const struct fs_parameter_spec orangefs_fs_param_spec[] = { fsparam_flag ("acl", Opt_acl), fsparam_flag ("intr", Opt_intr), fsparam_flag ("local_lock", Opt_local_lock), {} }; uint64_t orangefs_features; static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); if (orangefs_sb->flags & ORANGEFS_OPT_LOCAL_LOCK) seq_puts(m, ",local_lock"); return 0; } static int orangefs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, orangefs_fs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_acl: fc->sb_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; break; case Opt_local_lock: orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; break; } return 0; } static void orangefs_inode_cache_ctor(void *req) { struct orangefs_inode_s *orangefs_inode = req; inode_init_once(&orangefs_inode->vfs_inode); init_rwsem(&orangefs_inode->xattr_sem); } static struct inode *orangefs_alloc_inode(struct super_block *sb) { struct orangefs_inode_s *orangefs_inode; orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL); if (!orangefs_inode) return NULL; /* * We want to clear everything except for rw_semaphore and the * vfs_inode. */ memset(&orangefs_inode->refn.khandle, 0, 16); orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; orangefs_inode->last_failed_block_index_read = 0; memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_alloc_inode: allocated %p\n", &orangefs_inode->vfs_inode); return &orangefs_inode->vfs_inode; } static void orangefs_free_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_cached_xattr *cx; struct hlist_node *tmp; int i; hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) { hlist_del(&cx->node); kfree(cx); } kmem_cache_free(orangefs_inode_cache, orangefs_inode); } static void orangefs_destroy_inode(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); gossip_debug(GOSSIP_SUPER_DEBUG, "%s: deallocated %p destroying inode %pU\n", __func__, orangefs_inode, get_khandle_from_ino(inode)); } static int orangefs_write_inode(struct inode *inode, struct writeback_control *wbc) { gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n"); return orangefs_inode_setattr(inode); } /* * NOTE: information filled in here is typically reflected in the * output of the system command 'df' */ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) { int ret = -ENOMEM; struct orangefs_kernel_op_s *new_op = NULL; int flags = 0; struct super_block *sb = NULL; sb = dentry->d_sb; gossip_debug(GOSSIP_SUPER_DEBUG, "%s: called on sb %p (fs_id is %d)\n", __func__, sb, (int)(ORANGEFS_SB(sb)->fs_id)); new_op = op_alloc(ORANGEFS_VFS_OP_STATFS); if (!new_op) return ret; new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id; if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR) flags = ORANGEFS_OP_INTERRUPTIBLE; ret = service_operation(new_op, "orangefs_statfs", flags); if (new_op->downcall.status < 0) goto out_op_release; gossip_debug(GOSSIP_SUPER_DEBUG, "%s: got %ld blocks available | " "%ld blocks total | %ld block size | " "%ld files total | %ld files avail\n", __func__, (long)new_op->downcall.resp.statfs.blocks_avail, (long)new_op->downcall.resp.statfs.blocks_total, (long)new_op->downcall.resp.statfs.block_size, (long)new_op->downcall.resp.statfs.files_total, (long)new_op->downcall.resp.statfs.files_avail); buf->f_type = sb->s_magic; buf->f_fsid.val[0] = ORANGEFS_SB(sb)->fs_id; buf->f_fsid.val[1] = ORANGEFS_SB(sb)->id; buf->f_bsize = new_op->downcall.resp.statfs.block_size; buf->f_namelen = ORANGEFS_NAME_MAX; buf->f_blocks = (sector_t) new_op->downcall.resp.statfs.blocks_total; buf->f_bfree = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total; buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail; buf->f_frsize = 0; out_op_release: op_release(new_op); gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret); return ret; } /* * Remount as initiated by VFS layer. We just need to reparse the mount * options, no need to signal pvfs2-client-core about it. */ static int orangefs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); struct orangefs_sb_info_s *revised = fc->s_fs_info; unsigned int flags; flags = orangefs_sb->flags; flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK); flags |= revised->flags; WRITE_ONCE(orangefs_sb->flags, flags); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n"); return 0; } /* * Remount as initiated by pvfs2-client-core on restart. This is used to * repopulate mount information left from previous pvfs2-client-core. * * the idea here is that given a valid superblock, we're * re-initializing the user space client with the initial mount * information specified when the super block was first initialized. * this is very different than the first initialization/creation of a * superblock. we use the special service_priority_operation to make * sure that the mount gets ahead of any other pending operation that * is waiting for servicing. this means that the pvfs2-client won't * fail to start several times for all other pending operations before * the client regains all of the mount information from us. * NOTE: this function assumes that the request_mutex is already acquired! */ int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb) { struct orangefs_kernel_op_s *new_op; int ret = -EINVAL; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n"); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) return -ENOMEM; strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, orangefs_sb->devname); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Remount via host %s\n", new_op->upcall.req.fs_mount.orangefs_config_server); /* * we assume that the calling function has already acquired the * request_mutex to prevent other operations from bypassing * this one */ ret = service_operation(new_op, "orangefs_remount", ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: mount got return value of %d\n", ret); if (ret == 0) { /* * store the id assigned to this sb -- it's just a * short-lived mapping that the system interface uses * to map this superblock to a particular mount entry */ orangefs_sb->id = new_op->downcall.resp.fs_mount.id; orangefs_sb->mount_pending = 0; } op_release(new_op); if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX); if (!ret) orangefs_features = new_op->downcall.resp.features.features; else orangefs_features = 0; op_release(new_op); } else { orangefs_features = 0; } return ret; } int fsid_key_table_initialize(void) { return 0; } void fsid_key_table_finalize(void) { } static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; static struct dentry *orangefs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct orangefs_object_kref refn; if (fh_len < 5 || fh_type > 2) return NULL; ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16); refn.fs_id = (u32) fid->raw[4]; gossip_debug(GOSSIP_SUPER_DEBUG, "fh_to_dentry: handle %pU, fs_id %d\n", &refn.khandle, refn.fs_id); return d_obtain_alias(orangefs_iget(sb, &refn)); } static int orangefs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { int len = parent ? 10 : 5; int type = 1; struct orangefs_object_kref refn; if (*max_len < len) { gossip_err("fh buffer is too small for encoding\n"); *max_len = len; type = 255; goto out; } refn = ORANGEFS_I(inode)->refn; ORANGEFS_khandle_to(&refn.khandle, fh, 16); fh[4] = refn.fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, "Encoding fh: handle %pU, fsid %u\n", &refn.khandle, refn.fs_id); if (parent) { refn = ORANGEFS_I(parent)->refn; ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16); fh[9] = refn.fs_id; type = 2; gossip_debug(GOSSIP_SUPER_DEBUG, "Encoding parent: handle %pU, fsid %u\n", &refn.khandle, refn.fs_id); } *max_len = len; out: return type; } static const struct export_operations orangefs_export_ops = { .encode_fh = orangefs_encode_fh, .fh_to_dentry = orangefs_fh_to_dentry, }; static int orangefs_unmount(int id, __s32 fs_id, const char *devname) { struct orangefs_kernel_op_s *op; int r; op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); if (!op) return -ENOMEM; op->upcall.req.fs_umount.id = id; op->upcall.req.fs_umount.fs_id = fs_id; strscpy(op->upcall.req.fs_umount.orangefs_config_server, devname); r = service_operation(op, "orangefs_fs_umount", 0); /* Not much to do about an error here. */ if (r) gossip_err("orangefs_unmount: service_operation %d\n", r); op_release(op); return r; } static int orangefs_fill_sb(struct super_block *sb, struct fs_context *fc, struct orangefs_fs_mount_response *fs_mount) { int ret; struct inode *root; struct dentry *root_dentry; struct orangefs_object_kref root_object; ORANGEFS_SB(sb)->sb = sb; ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle; ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; ORANGEFS_SB(sb)->id = fs_mount->id; /* Hang the xattr handlers off the superblock */ sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; sb->s_op = &orangefs_s_ops; sb->s_d_op = &orangefs_dentry_operations; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; ret = super_setup_bdi(sb); if (ret) return ret; root_object.khandle = ORANGEFS_SB(sb)->root_khandle; root_object.fs_id = ORANGEFS_SB(sb)->fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, "get inode %pU, fsid %d\n", &root_object.khandle, root_object.fs_id); root = orangefs_iget(sb, &root_object); if (IS_ERR(root)) return PTR_ERR(root); gossip_debug(GOSSIP_SUPER_DEBUG, "Allocated root inode [%p] with mode %x\n", root, root->i_mode); /* allocates and places root dentry in dcache */ root_dentry = d_make_root(root); if (!root_dentry) return -ENOMEM; sb->s_export_op = &orangefs_export_ops; sb->s_root = root_dentry; return 0; } static int orangefs_get_tree(struct fs_context *fc) { int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; if (!fc->source) return invalf(fc, "Device name not specified.\n"); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: called with devname %s\n", fc->source); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) return -ENOMEM; strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Mount via host %s\n", new_op->upcall.req.fs_mount.orangefs_config_server); ret = service_operation(new_op, "orangefs_mount", 0); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: mount got return value of %d\n", ret); if (ret) goto free_op; if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) { gossip_err("ERROR: Retrieved null fs_id\n"); ret = -EINVAL; goto free_op; } sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { ret = PTR_ERR(sb); orangefs_unmount(new_op->downcall.resp.fs_mount.id, new_op->downcall.resp.fs_mount.fs_id, fc->source); goto free_op; } /* init our private orangefs sb info */ ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount); if (ret) goto free_sb_and_op; /* * on successful mount, store the devname and data * used */ strscpy(ORANGEFS_SB(sb)->devname, fc->source); /* mount_pending must be cleared */ ORANGEFS_SB(sb)->mount_pending = 0; /* * finally, add this sb to our list of known orangefs * sb's */ gossip_debug(GOSSIP_SUPER_DEBUG, "Adding SB %p to orangefs superblocks\n", ORANGEFS_SB(sb)); spin_lock(&orangefs_superblocks_lock); list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks); spin_unlock(&orangefs_superblocks_lock); op_release(new_op); /* Must be removed from the list now. */ ORANGEFS_SB(sb)->no_list = 0; if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", 0); orangefs_features = new_op->downcall.resp.features.features; op_release(new_op); } else { orangefs_features = 0; } fc->root = dget(sb->s_root); return 0; free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ ORANGEFS_SB(sb)->no_list = 1; /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); if (ret == -EINVAL) { gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n"); gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n"); } op_release(new_op); return ret; } static void orangefs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations orangefs_context_ops = { .free = orangefs_free_fc, .parse_param = orangefs_parse_param, .get_tree = orangefs_get_tree, .reconfigure = orangefs_reconfigure, }; /* * Set up the filesystem mount context. */ int orangefs_init_fs_context(struct fs_context *fc) { struct orangefs_sb_info_s *osi; osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); if (!osi) return -ENOMEM; /* * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ fc->sb_flags_mask &= ~SB_POSIXACL; osi->flags &= ~ORANGEFS_OPT_INTR; osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; fc->s_fs_info = osi; fc->ops = &orangefs_context_ops; return 0; } void orangefs_kill_sb(struct super_block *sb) { int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ kill_anon_super(sb); if (!ORANGEFS_SB(sb)) { mutex_lock(&orangefs_request_mutex); mutex_unlock(&orangefs_request_mutex); return; } /* * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, ORANGEFS_SB(sb)->devname); if (!r) ORANGEFS_SB(sb)->mount_pending = 1; if (!ORANGEFS_SB(sb)->no_list) { /* remove the sb from our list of orangefs specific sb's */ spin_lock(&orangefs_superblocks_lock); /* not list_del_init */ __list_del_entry(&ORANGEFS_SB(sb)->list); ORANGEFS_SB(sb)->list.prev = NULL; spin_unlock(&orangefs_superblocks_lock); } /* * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us * gets completed before we free the dang thing. */ mutex_lock(&orangefs_request_mutex); mutex_unlock(&orangefs_request_mutex); /* free the orangefs superblock private data */ kfree(ORANGEFS_SB(sb)); } int orangefs_inode_cache_initialize(void) { orangefs_inode_cache = kmem_cache_create_usercopy( "orangefs_inode_cache", sizeof(struct orangefs_inode_s), 0, 0, offsetof(struct orangefs_inode_s, link_target), sizeof_field(struct orangefs_inode_s, link_target), orangefs_inode_cache_ctor); if (!orangefs_inode_cache) { gossip_err("Cannot create orangefs_inode_cache\n"); return -ENOMEM; } return 0; } int orangefs_inode_cache_finalize(void) { kmem_cache_destroy(orangefs_inode_cache); return 0; } |
| 22 22 22 8 8 8 7 6 6 6 3 6 6 6 18 15 30 24 23 9 7 7 24 22 22 22 22 22 16 16 1 17 11 17 17 17 16 16 16 16 15 17 17 6 9 17 13 7 7 7 7 7 3 3 4 4 3 2 8 1 5 5 5 4 1 1 1 7 7 8 8 7 7 6 5 2 8 5 4 5 3 3 3 3 3 3 17 11 17 17 4 3 5 5 4 1 4 17 17 11 17 17 5 5 17 9 9 3 18 18 18 18 9 18 7 9 6 45 24 17 13 13 24 17 8 8 8 5 5 5 5 3 3 3 17 17 4 17 17 5 16 10 6 16 16 16 16 16 16 17 5 5 5 1 1 1 1 6 4 4 4 2 4 2 2 3 5 7 2 1 3 3 2 2 1 2 3 3 3 3 3 2 2 2 2 29 3 29 3 3 3 29 3 29 8 2 8 2 2 2 8 8 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 2 1 2 2 2 2 2 26 2 3 4 4 4 22 22 22 22 22 22 22 21 22 22 22 22 22 18 9 9 9 18 18 18 18 18 6 6 3 3 6 5 3 3 3 6 2 6 6 2 2 6 6 22 22 22 22 22 829 653 653 832 816 466 831 22 22 829 828 835 340 826 16 16 16 60 817 821 69 4 70 70 2 5 69 47 13 47 36 1 2 47 117 117 70 47 116 117 107 41 617 1 4 617 616 616 616 616 616 616 2 2 2 21 21 21 21 21 21 617 617 617 616 616 2 3 617 21 17 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 | // SPDX-License-Identifier: GPL-2.0-only /* * xfrm_policy.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor * */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> #include <linux/rhashtable.h> #include <linux/if_tunnel.h> #include <linux/icmp.h> #include <net/dst.h> #include <net/flow.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/gre.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #ifdef CONFIG_XFRM_ESPINTCP #include <net/espintcp.h> #endif #include <net/inet_dscp.h> #include "xfrm_hash.h" #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 struct xfrm_flo { struct dst_entry *dst_orig; u8 flags; }; /* prefixes smaller than this are stored in lists, not trees. */ #define INEXACT_PREFIXLEN_IPV4 16 #define INEXACT_PREFIXLEN_IPV6 48 struct xfrm_pol_inexact_node { struct rb_node node; union { xfrm_address_t addr; struct rcu_head rcu; }; u8 prefixlen; struct rb_root root; /* the policies matching this node, can be empty list */ struct hlist_head hhead; }; /* xfrm inexact policy search tree: * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); * | * +---- root_d: sorted by daddr:prefix * | | * | xfrm_pol_inexact_node * | | * | +- root: sorted by saddr/prefix * | | | * | | xfrm_pol_inexact_node * | | | * | | + root: unused * | | | * | | + hhead: saddr:daddr policies * | | * | +- coarse policies and all any:daddr policies * | * +---- root_s: sorted by saddr:prefix * | | * | xfrm_pol_inexact_node * | | * | + root: unused * | | * | + hhead: saddr:any policies * | * +---- coarse policies and all any:any policies * * Lookups return four candidate lists: * 1. any:any list from top-level xfrm_pol_inexact_bin * 2. any:daddr list from daddr tree * 3. saddr:daddr list from 2nd level daddr tree * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with * the lowest priority. If two candidates have the same priority, the * struct xfrm_policy pos member with the lower number is used. * * This replicates previous single-list-search algorithm which would * return first matching policy in the (ordered-by-priority) list. */ struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; u16 family; u8 dir, type; }; struct xfrm_pol_inexact_bin { struct xfrm_pol_inexact_key k; struct rhash_head head; /* list containing '*:*' policies */ struct hlist_head hhead; seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; /* tree sorted by saddr/prefix */ struct rb_root root_s; /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { XFRM_POL_CAND_BOTH, XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, XFRM_POL_CAND_MAX, }; struct xfrm_pol_inexact_candidates { struct hlist_head *res[XFRM_POL_CAND_MAX]; }; struct xfrm_flow_keys { struct flow_dissector_key_basic basic; struct flow_dissector_key_control control; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; } addrs; struct flow_dissector_key_ip ip; struct flow_dissector_key_icmp icmp; struct flow_dissector_key_ports ports; struct flow_dissector_key_keyid gre; }; static struct flow_dissector xfrm_session_dissector __ro_after_init; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr); static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); } static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); } static inline bool __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && (fl6->flowi6_proto == sel->proto || !sel->proto) && (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_selector_match(sel, fl); case AF_INET6: return __xfrm6_selector_match(sel, fl); } return false; } static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) { const struct xfrm_policy_afinfo *afinfo; if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } /* Called with rcu_read_lock(). */ static const struct xfrm_if_cb *xfrm_if_get_cb(void) { return rcu_dereference(xfrm_if_cb); } struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); dst = afinfo->dst_lookup(params); rcu_read_unlock(); return dst; } EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, dscp_t dscp, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { struct xfrm_dst_lookup_params params; struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; struct dst_entry *dst; if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { saddr = x->coaddr; daddr = prev_daddr; } if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { saddr = prev_saddr; daddr = x->coaddr; } params.net = net; params.saddr = saddr; params.daddr = daddr; params.dscp = dscp; params.oif = oif; params.mark = mark; params.ipproto = x->id.proto; if (x->encap) { switch (x->encap->encap_type) { case UDP_ENCAP_ESPINUDP: params.ipproto = IPPROTO_UDP; params.uli.ports.sport = x->encap->encap_sport; params.uli.ports.dport = x->encap->encap_dport; break; case TCP_ENCAP_ESPINTCP: params.ipproto = IPPROTO_TCP; params.uli.ports.sport = x->encap->encap_sport; params.uli.ports.dport = x->encap->encap_dport; break; } } dst = __xfrm_dst_lookup(family, ¶ms); if (!IS_ERR(dst)) { if (prev_saddr != saddr) memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); if (prev_daddr != daddr) memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); } return dst; } static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) return MAX_SCHEDULE_TIMEOUT-1; else return secs*HZ; } static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = from_timer(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int dir; read_lock(&xp->lock); if (unlikely(xp->walk.dead)) goto out; dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.soft_add_expires_seconds) { time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (warn) km_policy_expired(xp, dir, 0, 0); if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); out: read_unlock(&xp->lock); xfrm_pol_put(xp); return; expired: read_unlock(&xp->lock); if (!xfrm_policy_delete(xp, dir)) km_policy_expired(xp, dir, 1, 0); xfrm_pol_put(xp); } /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. */ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; policy = kzalloc(sizeof(struct xfrm_policy), gfp); if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); INIT_HLIST_HEAD(&policy->state_cache_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); timer_setup(&policy->timer, xfrm_policy_timer, 0); timer_setup(&policy->polq.hold_timer, xfrm_policy_queue_process, 0); } return policy; } EXPORT_SYMBOL(xfrm_policy_alloc); static void xfrm_policy_destroy_rcu(struct rcu_head *head) { struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); security_xfrm_policy_free(policy->security); kfree(policy); } /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); if (timer_delete(&policy->timer) || timer_delete(&policy->polq.hold_timer)) BUG(); xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); /* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ static void xfrm_policy_kill(struct xfrm_policy *policy) { struct net *net = xp_net(policy); struct xfrm_state *x; xfrm_dev_policy_delete(policy); write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); if (timer_delete(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); if (timer_delete(&policy->timer)) xfrm_pol_put(policy); /* XXX: Flush state cache */ spin_lock_bh(&net->xfrm.xfrm_state_lock); hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) { hlist_del_init_rcu(&x->state_cache); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); xfrm_pol_put(policy); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int idx_hash(struct net *net, u32 index) { return __idx_hash(index, net->xfrm.policy_idx_hmask); } /* calculate policy hash thresholds */ static void __get_hash_thresh(struct net *net, unsigned short family, int dir, u8 *dbits, u8 *sbits) { switch (family) { case AF_INET: *dbits = net->xfrm.policy_bydst[dir].dbits4; *sbits = net->xfrm.policy_bydst[dir].sbits4; break; case AF_INET6: *dbits = net->xfrm.policy_bydst[dir].dbits6; *sbits = net->xfrm.policy_bydst[dir].sbits6; break; default: *dbits = 0; *sbits = 0; } } static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask, int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; u8 dbits; u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; hlist_del_rcu(&pol->bydst); hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; goto redo; } } static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_policy *pol; hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); hlist_add_head(&pol->byidx, nidxtable+h); } } static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) { return ((old_hmask + 1) << 1) - 1; } static void xfrm_bydst_resize(struct net *net, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *ndst = xfrm_hash_alloc(nsize); struct hlist_head *odst; int i; if (!ndst) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } static void xfrm_byidx_resize(struct net *net) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *oidx = net->xfrm.policy_byidx; struct hlist_head *nidx = xfrm_hash_alloc(nsize); int i; if (!nidx) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) { unsigned int cnt = net->xfrm.policy_count[dir]; unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; if (total) *total += cnt; if ((hmask + 1) < xfrm_policy_hashmax && cnt > hmask) return 1; return 0; } static inline int xfrm_byidx_should_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; if ((hmask + 1) < xfrm_policy_hashmax && total > hmask) return 1; return 0; } void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; } EXPORT_SYMBOL(xfrm_spd_getinfo); static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hash_work); int dir, total; mutex_lock(&hash_resize_mutex); total = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) xfrm_byidx_resize(net); mutex_unlock(&hash_resize_mutex); } /* Make sure *pol can be inserted into fastbin. * Useful to check that later insert requests will be successful * (provided xfrm_policy_lock is held throughout). */ static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) { struct xfrm_pol_inexact_bin *bin, *prev; struct xfrm_pol_inexact_key k = { .family = pol->family, .type = pol->type, .dir = dir, .if_id = pol->if_id, }; struct net *net = xp_net(pol); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); write_pnet(&k.net, net); bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); if (bin) return bin; bin = kzalloc(sizeof(*bin), GFP_ATOMIC); if (!bin) return NULL; bin->k = k; INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, xfrm_pol_inexact_params); if (!prev) { list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); return bin; } kfree(bin); return IS_ERR(prev) ? NULL : prev; } static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, int family, u8 prefixlen) { if (xfrm_addr_any(addr, family)) return true; if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) return true; if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) return true; return false; } static bool xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) { const xfrm_address_t *addr; bool saddr_any, daddr_any; u8 prefixlen; addr = &policy->selector.saddr; prefixlen = policy->selector.prefixlen_s; saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); addr = &policy->selector.daddr; prefixlen = policy->selector.prefixlen_d; daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); return saddr_any && daddr_any; } static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, const xfrm_address_t *addr, u8 prefixlen) { node->addr = *addr; node->prefixlen = prefixlen; } static struct xfrm_pol_inexact_node * xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) { struct xfrm_pol_inexact_node *node; node = kzalloc(sizeof(*node), GFP_ATOMIC); if (node) xfrm_pol_inexact_node_init(node, addr, prefixlen); return node; } static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: if (prefixlen == 0) return 0; mask = ~0U << (32 - prefixlen); ma = ntohl(a->a4) & mask; mb = ntohl(b->a4) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; if (pdw) { delta = memcmp(a->a6, b->a6, pdw << 2); if (delta) return delta; } if (pbi) { mask = ~0U << (32 - pbi); ma = ntohl(a->a6[pdw]) & mask; mb = ntohl(b->a6[pdw]) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; } break; default: break; } return delta; } static void xfrm_policy_inexact_list_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, u16 family) { unsigned int matched_s, matched_d; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { struct hlist_node *newpos = NULL; bool matches_s, matches_d; if (policy->walk.dead || !policy->bydst_reinsert) continue; WARN_ON_ONCE(policy->family != family); policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { if (policy->priority > p->priority) newpos = &p->bydst; else if (policy->priority == p->priority && policy->pos > p->pos) newpos = &p->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. * Check that the reinserted policy matches at least * saddr or daddr for current node prefix. * * Matching both is fine, matching saddr in one policy * (but not daddr) and then matching only daddr in another * is a bug. */ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, &n->addr, n->prefixlen, family) == 0; matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, &n->addr, n->prefixlen, family) == 0; if (matches_s && matches_d) continue; WARN_ON_ONCE(!matches_s && !matches_d); if (matches_s) matched_s++; if (matches_d) matched_d++; WARN_ON_ONCE(matched_s && matched_d); } } static void xfrm_policy_inexact_node_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, struct rb_root *new, u16 family) { struct xfrm_pol_inexact_node *node; struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); restart: parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); prefixlen = min(node->prefixlen, n->prefixlen); delta = xfrm_policy_addr_delta(&n->addr, &node->addr, prefixlen, family); if (delta < 0) { p = &parent->rb_left; } else if (delta > 0) { p = &parent->rb_right; } else { bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } node->prefixlen = prefixlen; xfrm_policy_inexact_list_reinsert(net, node, family); if (same_prefixlen) { kfree_rcu(n, rcu); return; } rb_erase(*p, new); kfree_rcu(n, rcu); n = node; goto restart; } } rb_link_node_rcu(&n->node, parent, p); rb_insert_color(&n->node, new); } /* merge nodes v and n */ static void xfrm_policy_inexact_node_merge(struct net *net, struct xfrm_pol_inexact_node *v, struct xfrm_pol_inexact_node *n, u16 family) { struct xfrm_pol_inexact_node *node; struct xfrm_policy *tmp; struct rb_node *rnode; /* To-be-merged node v has a subtree. * * Dismantle it and insert its nodes to n->root. */ while ((rnode = rb_first(&v->root)) != NULL) { node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); rb_erase(&node->node, &v->root); xfrm_policy_inexact_node_reinsert(net, node, &n->root, family); } hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } xfrm_policy_inexact_list_reinsert(net, n, family); } static struct xfrm_pol_inexact_node * xfrm_policy_inexact_insert_node(struct net *net, struct rb_root *root, xfrm_address_t *addr, u16 family, u8 prefixlen, u8 dir) { struct xfrm_pol_inexact_node *cached = NULL; struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; p = &root->rb_node; while (*p) { int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta == 0 && prefixlen >= node->prefixlen) { WARN_ON_ONCE(cached); /* ipsec policies got lost */ return node; } if (delta < 0) p = &parent->rb_left; else p = &parent->rb_right; if (prefixlen < node->prefixlen) { delta = xfrm_policy_addr_delta(addr, &node->addr, prefixlen, family); if (delta) continue; /* This node is a subnet of the new prefix. It needs * to be removed and re-inserted with the smaller * prefix and all nodes that are now also covered * by the reduced prefixlen. */ rb_erase(&node->node, root); if (!cached) { xfrm_pol_inexact_node_init(node, addr, prefixlen); cached = node; } else { /* This node also falls within the new * prefixlen. Merge the to-be-reinserted * node and this one. */ xfrm_policy_inexact_node_merge(net, node, cached, family); kfree_rcu(node, rcu); } /* restart */ p = &root->rb_node; parent = NULL; } } node = cached; if (!node) { node = xfrm_pol_inexact_node_alloc(addr, prefixlen); if (!node) return NULL; } rb_link_node_rcu(&node->node, parent, p); rb_insert_color(&node->node, root); return node; } static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) { struct xfrm_pol_inexact_node *node; struct rb_node *rn = rb_first(r); while (rn) { node = rb_entry(rn, struct xfrm_pol_inexact_node, node); xfrm_policy_inexact_gc_tree(&node->root, rm); rn = rb_next(rn); if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { WARN_ON_ONCE(rm); continue; } rb_erase(&node->node, r); kfree_rcu(node, rcu); } } static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) { write_seqcount_begin(&b->count); xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); write_seqcount_end(&b->count); if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; } if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, xfrm_pol_inexact_params) == 0) { list_del(&b->inexact_bins); kfree_rcu(b, rcu); } } static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) { struct net *net = read_pnet(&b->k.net); spin_lock_bh(&net->xfrm.xfrm_policy_lock); __xfrm_policy_inexact_prune_bin(b, false); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static void __xfrm_policy_inexact_flush(struct net *net) { struct xfrm_pol_inexact_bin *bin, *t; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(bin, false); } static struct hlist_head * xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, struct xfrm_policy *policy, u8 dir) { struct xfrm_pol_inexact_node *n; struct net *net; net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, policy->selector.prefixlen_d)) { write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_s, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } /* daddr is fixed */ write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_d, &policy->selector.daddr, policy->family, policy->selector.prefixlen_d, dir); write_seqcount_end(&bin->count); if (!n) return NULL; /* saddr is wildcard */ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, policy->family, policy->selector.prefixlen_s)) return &n->hhead; write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &n->root, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } static struct xfrm_policy * xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) { struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *delpol; struct hlist_head *chain; struct net *net; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) return ERR_PTR(-ENOMEM); net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); if (!chain) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-ENOMEM); } delpol = xfrm_policy_insert_list(chain, policy, excl); if (delpol && excl) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-EEXIST); } if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); return delpol; } static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy) { int dir; if (policy->walk.dead) return true; dir = xfrm_policy_id2dir(policy->index); return dir >= XFRM_POLICY_MAX; } static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; struct hlist_node *newpos; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; mutex_lock(&hash_resize_mutex); /* read selector prefixlen thresholds */ do { seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); lbits4 = net->xfrm.policy_hthresh.lbits4; rbits4 = net->xfrm.policy_hthresh.rbits4; lbits6 = net->xfrm.policy_hthresh.lbits6; rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. */ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; if (xfrm_policy_is_dead_or_sk(policy)) continue; dir = xfrm_policy_id2dir(policy->index); if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; sbits = lbits4; } else { dbits = rbits6; sbits = lbits6; } } else { if (policy->family == AF_INET) { dbits = lbits4; sbits = rbits4; } else { dbits = lbits6; sbits = rbits6; } } if (policy->selector.prefixlen_d < dbits || policy->selector.prefixlen_s < sbits) continue; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) goto out_unlock; if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) goto out_unlock; } for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; net->xfrm.policy_bydst[dir].sbits4 = lbits4; net->xfrm.policy_bydst[dir].dbits6 = rbits6; net->xfrm.policy_bydst[dir].sbits6 = lbits6; } else { /* dir in/fwd => dst = local, src = remote */ net->xfrm.policy_bydst[dir].dbits4 = lbits4; net->xfrm.policy_bydst[dir].sbits4 = rbits4; net->xfrm.policy_bydst[dir].dbits6 = lbits6; net->xfrm.policy_bydst[dir].sbits6 = rbits6; } } /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (xfrm_policy_is_dead_or_sk(policy)) continue; hlist_del_rcu(&policy->bydst); newpos = NULL; dir = xfrm_policy_id2dir(policy->index); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); continue; } hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; else break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); } out_unlock: __xfrm_policy_inexact_flush(net); write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } void xfrm_policy_hash_rebuild(struct net *net) { schedule_work(&net->xfrm.policy_hthresh.work); } EXPORT_SYMBOL(xfrm_policy_hash_rebuild); /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { for (;;) { struct hlist_head *list; struct xfrm_policy *p; u32 idx; int found; if (!index) { idx = (net->xfrm.idx_generator | dir); net->xfrm.idx_generator += 8; } else { idx = index; index = 0; } if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; } } if (!found) return idx; } } static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) { u32 *p1 = (u32 *) s1; u32 *p2 = (u32 *) s2; int len = sizeof(struct xfrm_selector) / sizeof(u32); int i; for (i = 0; i < len; i++) { if (p1[i] != p2[i]) return 1; } return 0; } static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy *new) { struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; if (skb_queue_empty(&pq->hold_queue)) return; __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); if (timer_delete(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice(&list, &pq->hold_queue); pq->timeout = XFRM_QUEUE_TMO_MIN; if (!mod_timer(&pq->hold_timer, jiffies)) xfrm_pol_hold(new); spin_unlock_bh(&pq->hold_queue.lock); } static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, struct xfrm_policy *pol) { return mark->v == pol->mark.v && mark->m == pol->mark.m; } static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_key *k = data; u32 a = k->type << 24 | k->dir << 16 | k->family; return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), seed); } static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_bin *b = data; return xfrm_pol_bin_key(&b->k, 0, seed); } static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xfrm_pol_inexact_key *key = arg->key; const struct xfrm_pol_inexact_bin *b = ptr; int ret; if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) return -1; ret = b->k.dir ^ key->dir; if (ret) return ret; ret = b->k.type ^ key->type; if (ret) return ret; ret = b->k.family ^ key->family; if (ret) return ret; return b->k.if_id ^ key->if_id; } static const struct rhashtable_params xfrm_pol_inexact_params = { .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), .hashfn = xfrm_pol_bin_key, .obj_hashfn = xfrm_pol_bin_obj, .obj_cmpfn = xfrm_pol_bin_cmp, .automatic_shrinking = true, }; static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) { struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = pol; continue; } if (delpol) break; } if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else /* Packet offload policies enter to the head * to speed-up lookups. */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; } int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *delpol; struct hlist_head *chain; /* Sanitize mark before store */ policy->mark.v &= policy->mark.m; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) delpol = xfrm_policy_insert_list(chain, policy, excl); else delpol = xfrm_policy_inexact_insert(policy, dir, excl); if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return PTR_ERR(delpol); } __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) rt_genid_bump_ipv4(net); else rt_genid_bump_ipv6(net); if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); else if (xfrm_bydst_should_resize(net, dir, NULL)) schedule_work(&net->xfrm.policy_hash_work); return 0; } EXPORT_SYMBOL(xfrm_policy_insert); static struct xfrm_policy * __xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx) { struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) return pol; } return NULL; } struct xfrm_policy * xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_pol_inexact_bin *bin = NULL; struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { struct xfrm_pol_inexact_candidates cand; int i; bin = xfrm_policy_inexact_lookup(net, type, sel->family, dir, if_id); if (!bin) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } if (!xfrm_policy_find_inexact_candidates(&cand, bin, &sel->saddr, &sel->daddr)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } pol = NULL; for (i = 0; i < ARRAY_SIZE(cand.res); i++) { struct xfrm_policy *tmp; tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, if_id, type, dir, sel, ctx); if (!tmp) continue; if (!pol || tmp->pos < pol->pos) pol = tmp; } } else { pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, sel, ctx); } if (pol) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete(pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); if (bin && delete) xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy * xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) return NULL; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->type != type) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead) continue; dir = xfrm_policy_id2dir(pol->index); if (dir >= XFRM_POLICY_MAX || pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead) continue; dir = xfrm_policy_id2dir(pol->index); if (dir >= XFRM_POLICY_MAX || pol->xdo.dev != dev) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_dev_policy_flush); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) { struct xfrm_policy *pol; struct xfrm_policy_walk_entry *x; int error = 0; if (walk->type >= XFRM_POLICY_TYPE_MAX && walk->type != XFRM_POLICY_TYPE_ANY) return -EINVAL; if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else x = list_first_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; pol = container_of(x, struct xfrm_policy, walk); if (walk->type != XFRM_POLICY_TYPE_ANY && walk->type != pol->type) continue; error = func(pol, xfrm_policy_id2dir(pol->index), walk->seq, data); if (error) { list_move_tail(&walk->walk.all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { error = -ENOENT; goto out; } list_del_init(&walk->walk.all); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) { INIT_LIST_HEAD(&walk->walk.all); walk->walk.dead = 1; walk->type = type; walk->seq = 0; } EXPORT_SYMBOL(xfrm_policy_walk_init); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); /* * Find policy to apply to this flow. * * Returns 0 if policy found, else an -errno. */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; match = xfrm_selector_match(sel, fl, family); if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); return ret; } static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; int seq; again: seq = read_seqcount_begin(count); parent = rcu_dereference_raw(r->rb_node); while (parent) { struct xfrm_pol_inexact_node *node; int delta; node = rb_entry(parent, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta < 0) { parent = rcu_dereference_raw(parent->rb_left); continue; } else if (delta > 0) { parent = rcu_dereference_raw(parent->rb_right); continue; } return node; } if (read_seqcount_retry(count, seq)) goto again; return NULL; } static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct xfrm_pol_inexact_node *n; u16 family; if (!b) return false; family = b->k.family; memset(cand, 0, sizeof(*cand)); cand->res[XFRM_POL_CAND_ANY] = &b->hhead; n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, family); if (n) { cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; } n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; return true; } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_key k = { .family = family, .type = type, .dir = dir, .if_id = if_id, }; write_pnet(&k.net, net); return rhashtable_lookup(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_bin *bin; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); rcu_read_lock(); bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); rcu_read_unlock(); return bin; } static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { u32 priority = prefer ? prefer->priority : ~0u; struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { int err; if (pol->priority > priority) break; err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); continue; } if (prefer) { /* matches. Is it older than *prefer? */ if (pol->priority == priority && prefer->pos < pol->pos) return prefer; } return pol; } return NULL; } static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, u32 if_id) { struct xfrm_policy *tmp; int i; for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, fl, type, family, if_id); if (!tmp) continue; if (IS_ERR(tmp)) return tmp; prefer = tmp; } return prefer; } static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; break; } } if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) goto skip_inexact; bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, family, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) goto fail; } skip_inexact: if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: rcu_read_unlock(); return ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static u32 xfrm_gen_pos_slow(struct net *net) { struct xfrm_policy *policy; u32 i = 0; /* oldest entry is last in list */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (!xfrm_policy_is_dead_or_sk(policy)) policy->pos = ++i; } return i; } static u32 xfrm_gen_pos(struct net *net) { const struct xfrm_policy *policy; u32 i = 0; /* most recently added policy is at the head of the list */ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { if (xfrm_policy_is_dead_or_sk(policy)) continue; if (policy->pos == UINT_MAX) return xfrm_gen_pos_slow(net); i = policy->pos + 1; break; } return i; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_FWD: case XFRM_POLICY_OUT: pol->pos = xfrm_gen_pos(net); break; } list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr, const struct xfrm_dst_lookup_params *params) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(saddr, params); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. */ static int xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct net *net = xp_net(policy); int nx; int i, error; xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || tmpl->mode == XFRM_MODE_IPTFS || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { struct xfrm_dst_lookup_params params; memset(¶ms, 0, sizeof(params)); params.net = net; params.oif = fl->flowi_oif; params.daddr = remote; error = xfrm_get_saddr(tmpl->encap_family, &tmp, ¶ms); if (error) goto fail; local = &tmp; } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR); xfrm_state_put(x); error = -EINVAL; goto fail; } if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; daddr = remote; saddr = local; continue; } if (x) { error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); } else if (error == -ESRCH) { error = -EAGAIN; } if (!tmpl->optional) goto fail; } return nx; fail: for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } static int xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct xfrm_state *tp[XFRM_MAX_DEPTH]; struct xfrm_state **tpp = (npols > 1) ? tp : xfrm; int cnx = 0; int error; int ret; int i; for (i = 0; i < npols; i++) { if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { error = -ENOBUFS; goto fail; } ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); if (ret < 0) { error = ret; goto fail; } else cnx += ret; } /* found states are sorted for outbound processing */ if (npols > 1) xfrm_state_sort(xfrm, tpp, cnx, family); return cnx; fail: for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; } static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); return 0; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; if (!afinfo) return ERR_PTR(-EINVAL); switch (family) { case AF_INET: dst_ops = &net->xfrm.xfrm4_dst_ops; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: dst_ops = &net->xfrm.xfrm6_dst_ops; break; #endif default: BUG(); } xdst = dst_alloc(dst_ops, NULL, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { memset_after(xdst, 0, u.dst); } else xdst = ERR_PTR(-ENOBUFS); rcu_read_unlock(); return xdst; } static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { path->path_cookie = rt6_get_cookie(dst_rt6_info(dst)); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->fill_dst(xdst, dev, fl); rcu_read_unlock(); return err; } /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. */ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, struct xfrm_dst **bundle, int nx, const struct flowi *fl, struct dst_entry *dst) { const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; int nfheader_len = 0; int trailer_len = 0; int family = policy->selector.family; xfrm_address_t saddr, daddr; dscp_t dscp; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); dscp = xfrm_get_dscp(fl, family); dst_hold(dst); for (; i < nx; i++) { struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); struct dst_entry *dst1 = &xdst->u.dst; err = PTR_ERR(xdst); if (IS_ERR(xdst)) { dst_release(dst); goto put_states; } bundle[i] = xdst; if (!xdst_prev) xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); if (!inner_mode) { err = -EAFNOSUPPORT; dst_release(dst); goto put_states; } } else inner_mode = &xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; int oif; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); if (xfrm[i]->xso.type != XFRM_DEV_OFFLOAD_PACKET) family = xfrm[i]->props.family; oif = fl->flowi_oif ? : fl->flowi_l3mdev; dst = xfrm_dst_lookup(xfrm[i], dscp, oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; } else dst_hold(dst); dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->lastuse = now; dst1->input = dst_discard; if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { dst1->output = xfrm[i]->mode_cbs->output; } else { rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) dst1->output = afinfo->output; else dst1->output = dst_discard_out; rcu_read_unlock(); } xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) nfheader_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; } xfrm_dst_set_child(xdst_prev, dst); xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; xfrm_init_path(xdst0, dst, nfheader_len); xfrm_init_pmtu(bundle, nx); for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; xdst_prev->u.dst.header_len = header_len; xdst_prev->u.dst.trailer_len = trailer_len; header_len -= xdst_prev->u.dst.xfrm->props.header_len; trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) { int i; if (*num_pols == 0 || !pols[0]) { *num_pols = 0; *num_xfrms = 0; return 0; } if (IS_ERR(pols[0])) { *num_pols = 0; return PTR_ERR(pols[0]); } *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT, pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } #endif for (i = 0; i < *num_pols; i++) { if (pols[i]->action != XFRM_POLICY_ALLOW) { *num_xfrms = -1; break; } } return 0; } static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, struct dst_entry *dst_orig) { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst; struct dst_entry *dst; int err; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { if (err == 0) return NULL; if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); } xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; } static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); if (!skb) { spin_unlock(&pq->hold_queue.lock); goto out; } dst = skb_dst(skb); sk = skb->sk; /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(net, skb, &fl, dst->ops->family); skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; if (dst->flags & DST_XFRM_QUEUE) { dst_release(dst); if (pq->timeout >= XFRM_QUEUE_TMO_MAX) goto purge_queue; pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); goto out; } dst_release(dst); __skb_queue_head_init(&list); spin_lock(&pq->hold_queue.lock); pq->timeout = 0; skb_queue_splice_init(&pq->hold_queue, &list); spin_unlock(&pq->hold_queue.lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(net, skb, &fl, skb_dst(skb)->ops->family); skb->mark = skb_mark; dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; } nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); dst_output(net, skb_to_full_sk(skb), skb); } out: xfrm_pol_put(pol); return; purge_queue: pq->timeout = 0; skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { kfree_skb(skb); return -EAGAIN; } skb_dst_force(skb); spin_lock_bh(&pq->hold_queue.lock); if (!pq->timeout) pq->timeout = XFRM_QUEUE_TMO_MIN; sched_next = jiffies + pq->timeout; if (timer_delete(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); } __skb_queue_tail(&pq->hold_queue, skb); if (!mod_timer(&pq->hold_timer, sched_next)) xfrm_pol_hold(pol); spin_unlock_bh(&pq->hold_queue.lock); return 0; } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; xdst = xfrm_alloc_dst(net, family); if (IS_ERR(xdst)) return xdst; if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; dst1->output = xdst_queue_output; dst_hold(dst); xfrm_dst_set_child(xdst, dst); xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; err = xfrm_fill_dst(xdst, dev, fl); if (err) goto free_dst; out: return xdst; free_dst: dst_release(dst1); xdst = ERR_PTR(err); goto out; } static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto inc_error; if (num_pols == 0) return NULL; if (num_xfrms <= 0) goto make_dummy_bundle; xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { xfrm_pols_put(pols, num_pols); return NULL; } if (err != -EAGAIN) goto error; goto make_dummy_bundle; } else if (xdst == NULL) { num_xfrms = 0; goto make_dummy_bundle; } return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { dst_release(dst_orig); return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } rcu_read_unlock(); return ret; } /* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. * * xfrm_lookup uses an if_id of 0 by default, and is provided for * compatibility */ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; xdst = NULL; route = NULL; sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto dropdst; if (num_pols) { if (num_xfrms <= 0) { drop_pols = num_pols; goto no_transform; } xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); if (err == -EREMOTE) goto nopol; goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; drop_pols = num_pols; goto no_transform; } route = xdst->route; } } if (xdst == NULL) { struct xfrm_flo xflo; xflo.dst_orig = dst_orig; xflo.flags = flags; /* To accelerate a bit... */ if (!if_id && ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { err = PTR_ERR(xdst); goto dropdst; } num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } dst = &xdst->u.dst; if (route == NULL && num_xfrms > 0) { /* The only case when xfrm_bundle_lookup() returns a * bundle with null route, is when the template could * not be resolved. It means policies are there, but * bundle could not be created, since we don't yet * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); err = -EREMOTE; goto error; } err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; } no_transform: if (num_pols == 0) goto nopol; if ((flags & XFRM_LOOKUP_ICMP) && !(pols[0]->flags & XFRM_POLICY_ICMP)) { err = -ENOENT; goto error; } for (i = 0; i < num_pols; i++) WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds()); if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } else if (num_xfrms > 0) { /* Flow transformed */ dst_release(dst_orig); } else { /* Flow passes untransformed */ dst_release(dst); dst = dst_orig; } ok: xfrm_pols_put(pols, drop_pols); if (dst->xfrm && (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || dst->xfrm->props.mode == XFRM_MODE_IPTFS)) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { err = -EPERM; goto error; } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; } err = -ENOENT; error: dst_release(dst); dropdst: if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } EXPORT_SYMBOL(xfrm_lookup_with_ifid); /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); } EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). * Otherwise we may send out blackholed packets. */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; if (!sp || idx < 0 || idx >= sp->len) return 0; x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. */ static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && xfrm_state_addr_cmp(tmpl, x, family)) && (if_id == 0 || if_id == x->if_id); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass * because of optional transport mode, or next index of the matched secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, unsigned short family, u32 if_id) { int idx = start; if (tmpl->optional) { if (tmpl->mode == XFRM_MODE_TRANSPORT) return start; } else start = -1; for (; idx < sp->len; idx++) { if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (idx < sp->verified_cnt) { /* Secpath entry previously verified, consider optional and * continue searching */ continue; } if (start == -1) start = -2-idx; break; } } return start; } static void decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse) { struct flowi4 *fl4 = &fl->u.ip4; memset(fl4, 0, sizeof(struct flowi4)); if (reverse) { fl4->saddr = flkeys->addrs.ipv4.dst; fl4->daddr = flkeys->addrs.ipv4.src; fl4->fl4_sport = flkeys->ports.dst; fl4->fl4_dport = flkeys->ports.src; } else { fl4->saddr = flkeys->addrs.ipv4.src; fl4->daddr = flkeys->addrs.ipv4.dst; fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; } switch (flkeys->basic.ip_proto) { case IPPROTO_GRE: fl4->fl4_gre_key = flkeys->gre.keyid; break; case IPPROTO_ICMP: fl4->fl4_icmp_type = flkeys->icmp.type; fl4->fl4_icmp_code = flkeys->icmp.code; break; } fl4->flowi4_proto = flkeys->basic.ip_proto; fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; } #if IS_ENABLED(CONFIG_IPV6) static void decode_session6(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reverse) { struct flowi6 *fl6 = &fl->u.ip6; memset(fl6, 0, sizeof(struct flowi6)); if (reverse) { fl6->saddr = flkeys->addrs.ipv6.dst; fl6->daddr = flkeys->addrs.ipv6.src; fl6->fl6_sport = flkeys->ports.dst; fl6->fl6_dport = flkeys->ports.src; } else { fl6->saddr = flkeys->addrs.ipv6.src; fl6->daddr = flkeys->addrs.ipv6.dst; fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; } switch (flkeys->basic.ip_proto) { case IPPROTO_GRE: fl6->fl6_gre_key = flkeys->gre.keyid; break; case IPPROTO_ICMPV6: fl6->fl6_icmp_type = flkeys->icmp.type; fl6->fl6_icmp_code = flkeys->icmp.code; break; } fl6->flowi6_proto = flkeys->basic.ip_proto; } #endif int __xfrm_decode_session(struct net *net, struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { struct xfrm_flow_keys flkeys; memset(&flkeys, 0, sizeof(flkeys)); __skb_flow_dissect(net, skb, &xfrm_session_dissector, &flkeys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_ENCAP); switch (family) { case AF_INET: decode_session4(&flkeys, fl, reverse); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: decode_session6(&flkeys, fl, reverse); break; #endif default: return -EAFNOSUPPORT; } fl->flowi_mark = skb->mark; if (reverse) { fl->flowi_oif = skb->skb_iif; } else { int oif = 0; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; fl->flowi_oif = oif; } return security_xfrm_decode_session(skb, &fl->flowi_secid); } EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) { for (; k < sp->len; k++) { if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { *idxp = k; return 1; } } return 0; } static bool icmp_err_packet(const struct flowi *fl, unsigned short family) { const struct flowi4 *fl4 = &fl->u.ip4; if (family == AF_INET && fl4->flowi4_proto == IPPROTO_ICMP && (fl4->fl4_icmp_type == ICMP_DEST_UNREACH || fl4->fl4_icmp_type == ICMP_TIME_EXCEEDED)) return true; #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) { const struct flowi6 *fl6 = &fl->u.ip6; if (fl6->flowi6_proto == IPPROTO_ICMPV6 && (fl6->fl6_icmp_type == ICMPV6_DEST_UNREACH || fl6->fl6_icmp_type == ICMPV6_PKT_TOOBIG || fl6->fl6_icmp_type == ICMPV6_TIME_EXCEED)) return true; } #endif return false; } static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, const struct flowi *fl, struct flowi *fl1) { bool ret = true; struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); int hl = family == AF_INET ? (sizeof(struct iphdr) + sizeof(struct icmphdr)) : (sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr)); if (!newskb) return true; if (!pskb_pull(newskb, hl)) goto out; skb_reset_network_header(newskb); if (xfrm_decode_session_reverse(dev_net(skb->dev), newskb, fl1, family) < 0) goto out; fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; fl1->flowi_tos = fl->flowi_tos; nf_nat_decode_session(newskb, fl1, family); ret = false; out: consume_skb(newskb); return ret; } static bool xfrm_selector_inner_icmp_match(struct sk_buff *skb, unsigned short family, const struct xfrm_selector *sel, const struct flowi *fl) { bool ret = false; if (icmp_err_packet(fl, family)) { struct flowi fl1; if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return ret; ret = xfrm_selector_match(sel, &fl1, family); } return ret; } static inline struct xfrm_policy *xfrm_in_fwd_icmp(struct sk_buff *skb, const struct flowi *fl, unsigned short family, u32 if_id) { struct xfrm_policy *pol = NULL; if (icmp_err_packet(fl, family)) { struct flowi fl1; struct net *net = dev_net(skb->dev); if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return pol; pol = xfrm_policy_lookup(net, &fl1, family, XFRM_POLICY_FWD, if_id); if (IS_ERR(pol)) pol = NULL; } return pol; } static inline struct dst_entry *xfrm_out_fwd_icmp(struct sk_buff *skb, struct flowi *fl, unsigned short family, struct dst_entry *dst) { if (icmp_err_packet(fl, family)) { struct net *net = dev_net(skb->dev); struct dst_entry *dst2; struct flowi fl1; if (xfrm_icmp_flow_decode(skb, family, fl, &fl1)) return dst; dst_hold(dst); dst2 = xfrm_lookup(net, dst, &fl1, NULL, (XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_ICMP)); if (IS_ERR(dst2)) return dst; if (dst2->xfrm) { dst_release(dst); dst = dst2; } else { dst_release(dst2); } } return dst; } int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct xfrm_policy *pol; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int npols = 0; int xfrm_nr; int pi; int reverse; struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { struct xfrm_if_decode_session_result r; if (ifcb->decode_session(skb, family, &r)) { if_id = r.if_id; net = r.net; } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; if (__xfrm_decode_session(net, skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ sp = skb_sec_path(skb); if (sp) { int i; for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; int ret = 0; if (!xfrm_selector_match(&x->sel, &fl, family)) { ret = 1; if (x->props.flags & XFRM_STATE_ICMP && xfrm_selector_inner_icmp_match(skb, family, &x->sel, &fl)) ret = 0; if (ret) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; } } } } pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } } if (!pol) pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } if (!pol && dir == XFRM_POLICY_FWD) pol = xfrm_in_fwd_icmp(skb, &fl, family, if_id); if (!pol) { const bool is_crypto_offload = sp && (xfrm_input_state(skb)->xso.type == XFRM_DEV_OFFLOAD_CRYPTO); if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } if (sp && secpath_has_nontransport(sp, 0, &xerr_idx) && !is_crypto_offload) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; } /* This lockless write can happen from different cpus. */ WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); xfrm_pol_put(pols[0]); return 0; } /* This write can happen from different cpus. */ WRITE_ONCE(pols[1]->curlft.use_time, ktime_get_real_seconds()); npols++; } } #endif if (pol->action == XFRM_POLICY_ALLOW) { static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; int ti = 0; int i, k; sp = skb_sec_path(skb); if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && pols[pi]->action != XFRM_POLICY_ALLOW) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); goto reject; } if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. * Upon success, marks secpath entries as having been * verified to allow them to be skipped in future policy * checks (e.g. nested tunnels). */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } if (secpath_has_nontransport(sp, k, &xerr_idx)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } xfrm_pols_put(pols, npols); sp->verified_cnt = k; return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); reject_error: xfrm_pols_put(pols, npols); return 0; } EXPORT_SYMBOL(__xfrm_policy_check); int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(net, skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } if (dst && !dst->xfrm) dst = xfrm_out_fwd_icmp(skb, &fl, family, dst); skb_dst_set(skb, dst); return res; } EXPORT_SYMBOL(__xfrm_route_forward); /* Optimize later using cookies and generation ids. */ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to * get validated by dst_ops->check on every use. We do this * because when a normal route referenced by an XFRM dst is * obsoleted we do not go looking around for all parent * referencing XFRM dsts so that we can invalidate them. It * is just too much work. Instead we make the checks here on * every use. For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. */ } static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { if (dst->obsolete) sk_dst_reset(sk); } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. */ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (xfrm_policy_afinfo[i] != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir, err; if (net_eq(net, &init_net)) { xfrm_dst_cache = KMEM_CACHE(xfrm_dst, SLAB_HWCACHE_ALIGN | SLAB_PANIC); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(htab->table, sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void xfrm_policy_fini(struct net *net) { struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); xfrm_hash_free(htab->table, sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); spin_lock_bh(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(b, true); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); net->xfrm.policy_default[XFRM_POLICY_IN] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_FWD] = XFRM_USERPOLICY_ACCEPT; net->xfrm.policy_default[XFRM_POLICY_OUT] = XFRM_USERPOLICY_ACCEPT; rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; rv = xfrm_nat_keepalive_net_init(net); if (rv < 0) goto out_nat_keepalive; return 0; out_nat_keepalive: xfrm_sysctl_fini(net); out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { xfrm_nat_keepalive_net_fini(net); xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, }; static const struct flow_dissector_key xfrm_flow_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct xfrm_flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct xfrm_flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv4), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct xfrm_flow_keys, addrs.ipv6), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct xfrm_flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct xfrm_flow_keys, gre), }, { .key_id = FLOW_DISSECTOR_KEY_IP, .offset = offsetof(struct xfrm_flow_keys, ip), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct xfrm_flow_keys, icmp), }, }; void __init xfrm_init(void) { skb_flow_dissector_init(&xfrm_session_dissector, xfrm_flow_dissector_keys, ARRAY_SIZE(xfrm_flow_dissector_keys)); register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif register_xfrm_state_bpf(); xfrm_nat_keepalive_init(AF_INET); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol; struct flowi fl; memset(&fl, 0, sizeof(fl)); fl.flowi_proto = sel->proto; switch (sel->family) { case AF_INET: fl.u.ip4.saddr = sel->saddr.a4; fl.u.ip4.daddr = sel->daddr.a4; if (sel->proto == IPSEC_ULPROTO_ANY) break; fl.u.flowi4_oif = sel->ifindex; fl.u.ip4.fl4_sport = sel->sport; fl.u.ip4.fl4_dport = sel->dport; break; case AF_INET6: fl.u.ip6.saddr = sel->saddr.in6; fl.u.ip6.daddr = sel->daddr.in6; if (sel->proto == IPSEC_ULPROTO_ANY) break; fl.u.flowi6_oif = sel->ifindex; fl.u.ip6.fl4_sport = sel->sport; fl.u.ip6.fl4_dport = sel->dport; break; default: return ERR_PTR(-EAFNOSUPPORT); } rcu_read_lock(); pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id); if (IS_ERR_OR_NULL(pol)) goto out_unlock; if (!xfrm_pol_hold_rcu(pol)) pol = NULL; out_unlock: rcu_read_unlock(); return pol; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: case XFRM_MODE_IPTFS: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET && pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, struct netlink_ext_ack *extack) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) { NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; } } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; struct xfrm_state *x, *xc; struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ err = xfrm_migrate_check(m, num_migrate, extack); if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { NL_SET_ERR_MSG(extack, "Invalid policy direction"); err = -EINVAL; goto out; } /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); if (IS_ERR_OR_NULL(pol)) { NL_SET_ERR_MSG(extack, "Target policy not found"); err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack); if (xc) { x_new[nx_new] = xc; nx_new++; } else { err = -ENODATA; goto restore_state; } } } /* Stage 3 - update policy */ err = xfrm_policy_migrate(pol, m, num_migrate, extack); if (err < 0) goto restore_state; /* Stage 4 - delete old state(s) */ if (nx_cur) { xfrm_states_put(x_cur, nx_cur); xfrm_states_delete(x_cur, nx_cur); } /* Stage 5 - announce */ km_migrate(sel, dir, type, m, num_migrate, k, encap); xfrm_pol_put(pol); return 0; out: return err; restore_state: if (pol) xfrm_pol_put(pol); if (nx_cur) xfrm_states_put(x_cur, nx_cur); if (nx_new) xfrm_states_delete(x_new, nx_new); return err; } EXPORT_SYMBOL(xfrm_migrate); #endif |
| 1093 154 1093 154 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); if (!flow) return NULL; flow->rule = flow_rule_alloc(num_actions); if (!flow->rule) { kfree(flow); return NULL; } flow->rule->match.dissector = &flow->match.dissector; flow->rule->match.mask = &flow->match.mask; flow->rule->match.key = &flow->match.key; return flow; } void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type) { struct nft_flow_match *match = &flow->match; struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (ops->pf != NFPROTO_NETDEV || ops->hooknum != NF_NETDEV_INGRESS) return false; dev = ops->dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (this_dev && this_dev != ops->dev) continue; err = nft_chain_offload_cmd(basechain, ops->dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (i-- <= 0) break; nft_chain_offload_cmd(basechain, ops->dev, FLOW_BLOCK_UNBIND); } } return err; } static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->flags & NLM_F_REPLACE || !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (!nft_hook_find_ops(hook, dev)) continue; found = hook; break; } if (!found) continue; return chain; } } return NULL; } static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { unregister_netdevice_notifier(&nft_offload_netdev_notifier); } |
| 180 172 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* V4L2 device support header. Copyright (C) 2008 Hans Verkuil <hverkuil@xs4all.nl> */ #ifndef _V4L2_DEVICE_H #define _V4L2_DEVICE_H #include <media/media-device.h> #include <media/v4l2-subdev.h> #include <media/v4l2-dev.h> struct v4l2_ctrl_handler; /** * struct v4l2_device - main struct to for V4L2 device drivers * * @dev: pointer to struct device. * @mdev: pointer to struct media_device, may be NULL. * @subdevs: used to keep track of the registered subdevs * @lock: lock this struct; can be used by the driver as well * if this struct is embedded into a larger struct. * @name: unique device name, by default the driver name + bus ID * @notify: notify operation called by some sub-devices. * @ctrl_handler: The control handler. May be %NULL. * @prio: Device's priority state * @ref: Keep track of the references to this struct. * @release: Release function that is called when the ref count * goes to 0. * * Each instance of a V4L2 device should create the v4l2_device struct, * either stand-alone or embedded in a larger struct. * * It allows easy access to sub-devices (see v4l2-subdev.h) and provides * basic V4L2 device-level support. * * .. note:: * * #) @dev->driver_data points to this struct. * #) @dev might be %NULL if there is no parent device */ struct v4l2_device { struct device *dev; struct media_device *mdev; struct list_head subdevs; spinlock_t lock; char name[36]; void (*notify)(struct v4l2_subdev *sd, unsigned int notification, void *arg); struct v4l2_ctrl_handler *ctrl_handler; struct v4l2_prio_state prio; struct kref ref; void (*release)(struct v4l2_device *v4l2_dev); }; /** * v4l2_device_get - gets a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to increment the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ static inline void v4l2_device_get(struct v4l2_device *v4l2_dev) { kref_get(&v4l2_dev->ref); } /** * v4l2_device_put - puts a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to decrement the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ int v4l2_device_put(struct v4l2_device *v4l2_dev); /** * v4l2_device_register - Initialize v4l2_dev and make @dev->driver_data * point to @v4l2_dev. * * @dev: pointer to struct &device * @v4l2_dev: pointer to struct &v4l2_device * * .. note:: * @dev may be %NULL in rare cases (ISA devices). * In such case the caller must fill in the @v4l2_dev->name field * before calling this function. */ int __must_check v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); /** * v4l2_device_set_name - Optional function to initialize the * name field of struct &v4l2_device * * @v4l2_dev: pointer to struct &v4l2_device * @basename: base name for the device name * @instance: pointer to a static atomic_t var with the instance usage for * the device driver. * * v4l2_device_set_name() initializes the name field of struct &v4l2_device * using the driver name and a driver-global atomic_t instance. * * This function will increment the instance counter and returns the * instance value used in the name. * * Example: * * static atomic_t drv_instance = ATOMIC_INIT(0); * * ... * * instance = v4l2_device_set_name(&\ v4l2_dev, "foo", &\ drv_instance); * * The first time this is called the name field will be set to foo0 and * this function returns 0. If the name ends with a digit (e.g. cx18), * then the name will be set to cx18-0 since cx180 would look really odd. */ int v4l2_device_set_name(struct v4l2_device *v4l2_dev, const char *basename, atomic_t *instance); /** * v4l2_device_disconnect - Change V4L2 device state to disconnected. * * @v4l2_dev: pointer to struct v4l2_device * * Should be called when the USB parent disconnects. * Since the parent disappears, this ensures that @v4l2_dev doesn't have * an invalid parent pointer. * * .. note:: This function sets @v4l2_dev->dev to NULL. */ void v4l2_device_disconnect(struct v4l2_device *v4l2_dev); /** * v4l2_device_unregister - Unregister all sub-devices and any other * resources related to @v4l2_dev. * * @v4l2_dev: pointer to struct v4l2_device */ void v4l2_device_unregister(struct v4l2_device *v4l2_dev); /** * v4l2_device_register_subdev - Registers a subdev with a v4l2 device. * * @v4l2_dev: pointer to struct &v4l2_device * @sd: pointer to &struct v4l2_subdev * * While registered, the subdev module is marked as in-use. * * An error is returned if the module is no longer loaded on any attempts * to register it. */ #define v4l2_device_register_subdev(v4l2_dev, sd) \ __v4l2_device_register_subdev(v4l2_dev, sd, THIS_MODULE) int __must_check __v4l2_device_register_subdev(struct v4l2_device *v4l2_dev, struct v4l2_subdev *sd, struct module *module); /** * v4l2_device_unregister_subdev - Unregisters a subdev with a v4l2 device. * * @sd: pointer to &struct v4l2_subdev * * .. note :: * * Can also be called if the subdev wasn't registered. In such * case, it will do nothing. */ void v4l2_device_unregister_subdev(struct v4l2_subdev *sd); /** * __v4l2_device_register_subdev_nodes - Registers device nodes for * all subdevs of the v4l2 device that are marked with the * %V4L2_SUBDEV_FL_HAS_DEVNODE flag. * * @v4l2_dev: pointer to struct v4l2_device * @read_only: subdevices read-only flag. True to register the subdevices * device nodes in read-only mode, false to allow full access to the * subdevice userspace API. */ int __must_check __v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev, bool read_only); /** * v4l2_device_register_subdev_nodes - Registers subdevices device nodes with * unrestricted access to the subdevice userspace operations * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, false); #else return 0; #endif } /** * v4l2_device_register_ro_subdev_nodes - Registers subdevices device nodes * in read-only mode * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_ro_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, true); #else return 0; #endif } /** * v4l2_subdev_notify - Sends a notification to v4l2_device. * * @sd: pointer to &struct v4l2_subdev * @notification: type of notification. Please notice that the notification * type is driver-specific. * @arg: arguments for the notification. Those are specific to each * notification type. */ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd, unsigned int notification, void *arg) { if (sd && sd->v4l2_dev && sd->v4l2_dev->notify) sd->v4l2_dev->notify(sd, notification, arg); } /** * v4l2_device_supports_requests - Test if requests are supported. * * @v4l2_dev: pointer to struct v4l2_device */ static inline bool v4l2_device_supports_requests(struct v4l2_device *v4l2_dev) { return v4l2_dev->mdev && v4l2_dev->mdev->ops && v4l2_dev->mdev->ops->req_queue; } /* Helper macros to iterate over all subdevs. */ /** * v4l2_device_for_each_subdev - Helper macro that interates over all * sub-devices of a given &v4l2_device. * * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * * This macro iterates over all sub-devices owned by the @v4l2_dev device. * It acts as a for loop iterator and executes the next statement with * the @sd variable pointing to each sub-device in turn. */ #define v4l2_device_for_each_subdev(sd, v4l2_dev) \ list_for_each_entry(sd, &(v4l2_dev)->subdevs, list) /** * __v4l2_device_call_subdevs_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_p(v4l2_dev, sd, cond, o, f, args...) \ do { \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ (sd)->ops->o->f((sd) , ##args); \ } while (0) /** * __v4l2_device_call_subdevs - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs(v4l2_dev, cond, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ } while (0) /** * __v4l2_device_call_subdevs_until_err_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev sub-devices associated with @v4l2_dev. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, zero * otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err_p(v4l2_dev, sd, cond, o, f, args...) \ ({ \ long __err = 0; \ \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) { \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ __err = (sd)->ops->o->f((sd) , ##args); \ if (__err && __err != -ENOIOCTLCMD) \ break; \ } \ (__err == -ENOIOCTLCMD) ? 0 : __err; \ }) /** * __v4l2_device_call_subdevs_until_err - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err(v4l2_dev, cond, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ }) /** * v4l2_device_call_all - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_all(v4l2_dev, grpid, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ } while (0) /** * v4l2_device_call_until_err - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver, until an error occurs. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_until_err(v4l2_dev, grpid, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ }) /** * v4l2_device_mask_call_all - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_all(v4l2_dev, grpmsk, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ } while (0) /** * v4l2_device_mask_call_until_err - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_until_err(v4l2_dev, grpmsk, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ }) /** * v4l2_device_has_op - checks if any subdev with matching grpid has a * given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_has_op(v4l2_dev, grpid, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpid) && __sd->grp_id != (grpid)) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) /** * v4l2_device_mask_has_op - checks if any subdev with matching group * mask has a given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_mask_has_op(v4l2_dev, grpmsk, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpmsk) && !(__sd->grp_id & (grpmsk))) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) #endif |
| 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 | // SPDX-License-Identifier: GPL-2.0-or-later /* FS-Cache cache handling * * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define FSCACHE_DEBUG_LEVEL CACHE #include <linux/export.h> #include <linux/slab.h> #include "internal.h" static LIST_HEAD(fscache_caches); DECLARE_RWSEM(fscache_addremove_sem); EXPORT_SYMBOL(fscache_addremove_sem); DECLARE_WAIT_QUEUE_HEAD(fscache_clearance_waiters); EXPORT_SYMBOL(fscache_clearance_waiters); static atomic_t fscache_cache_debug_id; /* * Allocate a cache cookie. */ static struct fscache_cache *fscache_alloc_cache(const char *name) { struct fscache_cache *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache) { if (name) { cache->name = kstrdup(name, GFP_KERNEL); if (!cache->name) { kfree(cache); return NULL; } } refcount_set(&cache->ref, 1); INIT_LIST_HEAD(&cache->cache_link); cache->debug_id = atomic_inc_return(&fscache_cache_debug_id); } return cache; } static bool fscache_get_cache_maybe(struct fscache_cache *cache, enum fscache_cache_trace where) { bool success; int ref; success = __refcount_inc_not_zero(&cache->ref, &ref); if (success) trace_fscache_cache(cache->debug_id, ref + 1, where); return success; } /* * Look up a cache cookie. */ struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache) { struct fscache_cache *candidate, *cache, *unnamed = NULL; /* firstly check for the existence of the cache under read lock */ down_read(&fscache_addremove_sem); list_for_each_entry(cache, &fscache_caches, cache_link) { if (cache->name && name && strcmp(cache->name, name) == 0 && fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) goto got_cache_r; if (!cache->name && !name && fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) goto got_cache_r; } if (!name) { list_for_each_entry(cache, &fscache_caches, cache_link) { if (cache->name && fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) goto got_cache_r; } } up_read(&fscache_addremove_sem); /* the cache does not exist - create a candidate */ candidate = fscache_alloc_cache(name); if (!candidate) return ERR_PTR(-ENOMEM); /* write lock, search again and add if still not present */ down_write(&fscache_addremove_sem); list_for_each_entry(cache, &fscache_caches, cache_link) { if (cache->name && name && strcmp(cache->name, name) == 0 && fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) goto got_cache_w; if (!cache->name) { unnamed = cache; if (!name && fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) goto got_cache_w; } } if (unnamed && is_cache && fscache_get_cache_maybe(unnamed, fscache_cache_get_acquire)) goto use_unnamed_cache; if (!name) { list_for_each_entry(cache, &fscache_caches, cache_link) { if (cache->name && fscache_get_cache_maybe(cache, fscache_cache_get_acquire)) goto got_cache_w; } } list_add_tail(&candidate->cache_link, &fscache_caches); trace_fscache_cache(candidate->debug_id, refcount_read(&candidate->ref), fscache_cache_new_acquire); up_write(&fscache_addremove_sem); return candidate; got_cache_r: up_read(&fscache_addremove_sem); return cache; use_unnamed_cache: cache = unnamed; cache->name = candidate->name; candidate->name = NULL; got_cache_w: up_write(&fscache_addremove_sem); kfree(candidate->name); kfree(candidate); return cache; } /** * fscache_acquire_cache - Acquire a cache-level cookie. * @name: The name of the cache. * * Get a cookie to represent an actual cache. If a name is given and there is * a nameless cache record available, this will acquire that and set its name, * directing all the volumes using it to this cache. * * The cache will be switched over to the preparing state if not currently in * use, otherwise -EBUSY will be returned. */ struct fscache_cache *fscache_acquire_cache(const char *name) { struct fscache_cache *cache; ASSERT(name); cache = fscache_lookup_cache(name, true); if (IS_ERR(cache)) return cache; if (!fscache_set_cache_state_maybe(cache, FSCACHE_CACHE_IS_NOT_PRESENT, FSCACHE_CACHE_IS_PREPARING)) { pr_warn("Cache tag %s in use\n", name); fscache_put_cache(cache, fscache_cache_put_cache); return ERR_PTR(-EBUSY); } return cache; } EXPORT_SYMBOL(fscache_acquire_cache); /** * fscache_put_cache - Release a cache-level cookie. * @cache: The cache cookie to be released * @where: An indication of where the release happened * * Release the caller's reference on a cache-level cookie. The @where * indication should give information about the circumstances in which the call * occurs and will be logged through a tracepoint. */ void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where) { unsigned int debug_id; bool zero; int ref; if (IS_ERR_OR_NULL(cache)) return; debug_id = cache->debug_id; zero = __refcount_dec_and_test(&cache->ref, &ref); trace_fscache_cache(debug_id, ref - 1, where); if (zero) { down_write(&fscache_addremove_sem); list_del_init(&cache->cache_link); up_write(&fscache_addremove_sem); kfree(cache->name); kfree(cache); } } /** * fscache_relinquish_cache - Reset cache state and release cookie * @cache: The cache cookie to be released * * Reset the state of a cache and release the caller's reference on a cache * cookie. */ void fscache_relinquish_cache(struct fscache_cache *cache) { enum fscache_cache_trace where = (cache->state == FSCACHE_CACHE_IS_PREPARING) ? fscache_cache_put_prep_failed : fscache_cache_put_relinquish; cache->ops = NULL; cache->cache_priv = NULL; fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT); fscache_put_cache(cache, where); } EXPORT_SYMBOL(fscache_relinquish_cache); /** * fscache_add_cache - Declare a cache as being open for business * @cache: The cache-level cookie representing the cache * @ops: Table of cache operations to use * @cache_priv: Private data for the cache record * * Add a cache to the system, making it available for netfs's to use. * * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ int fscache_add_cache(struct fscache_cache *cache, const struct fscache_cache_ops *ops, void *cache_priv) { int n_accesses; _enter("{%s,%s}", ops->name, cache->name); BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); /* Get a ref on the cache cookie and keep its n_accesses counter raised * by 1 to prevent wakeups from transitioning it to 0 until we're * withdrawing caching services from it. */ n_accesses = atomic_inc_return(&cache->n_accesses); trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), n_accesses, fscache_access_cache_pin); down_write(&fscache_addremove_sem); cache->ops = ops; cache->cache_priv = cache_priv; fscache_set_cache_state(cache, FSCACHE_CACHE_IS_ACTIVE); up_write(&fscache_addremove_sem); pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); _leave(" = 0 [%s]", cache->name); return 0; } EXPORT_SYMBOL(fscache_add_cache); /** * fscache_begin_cache_access - Pin a cache so it can be accessed * @cache: The cache-level cookie * @why: An indication of the circumstances of the access for tracing * * Attempt to pin the cache to prevent it from going away whilst we're * accessing it and returns true if successful. This works as follows: * * (1) If the cache tests as not live (state is not FSCACHE_CACHE_IS_ACTIVE), * then we return false to indicate access was not permitted. * * (2) If the cache tests as live, then we increment the n_accesses count and * then recheck the liveness, ending the access if it ceased to be live. * * (3) When we end the access, we decrement n_accesses and wake up the any * waiters if it reaches 0. * * (4) Whilst the cache is caching, n_accesses is kept artificially * incremented to prevent wakeups from happening. * * (5) When the cache is taken offline, the state is changed to prevent new * accesses, n_accesses is decremented and we wait for n_accesses to * become 0. */ bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why) { int n_accesses; if (!fscache_cache_is_live(cache)) return false; n_accesses = atomic_inc_return(&cache->n_accesses); smp_mb__after_atomic(); /* Reread live flag after n_accesses */ trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), n_accesses, why); if (!fscache_cache_is_live(cache)) { fscache_end_cache_access(cache, fscache_access_unlive); return false; } return true; } /** * fscache_end_cache_access - Unpin a cache at the end of an access. * @cache: The cache-level cookie * @why: An indication of the circumstances of the access for tracing * * Unpin a cache after we've accessed it. The @why indicator is merely * provided for tracing purposes. */ void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why) { int n_accesses; smp_mb__before_atomic(); n_accesses = atomic_dec_return(&cache->n_accesses); trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), n_accesses, why); if (n_accesses == 0) wake_up_var(&cache->n_accesses); } /** * fscache_io_error - Note a cache I/O error * @cache: The record describing the cache * * Note that an I/O error occurred in a cache and that it should no longer be * used for anything. This also reports the error into the kernel log. * * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_io_error(struct fscache_cache *cache) { if (fscache_set_cache_state_maybe(cache, FSCACHE_CACHE_IS_ACTIVE, FSCACHE_CACHE_GOT_IOERROR)) pr_err("Cache '%s' stopped due to I/O error\n", cache->name); } EXPORT_SYMBOL(fscache_io_error); /** * fscache_withdraw_cache - Withdraw a cache from the active service * @cache: The cache cookie * * Begin the process of withdrawing a cache from service. This stops new * cache-level and volume-level accesses from taking place and waits for * currently ongoing cache-level accesses to end. */ void fscache_withdraw_cache(struct fscache_cache *cache) { int n_accesses; pr_notice("Withdrawing cache \"%s\" (%u objs)\n", cache->name, atomic_read(&cache->object_count)); fscache_set_cache_state(cache, FSCACHE_CACHE_IS_WITHDRAWN); /* Allow wakeups on dec-to-0 */ n_accesses = atomic_dec_return(&cache->n_accesses); trace_fscache_access_cache(cache->debug_id, refcount_read(&cache->ref), n_accesses, fscache_access_cache_unpin); wait_var_event(&cache->n_accesses, atomic_read(&cache->n_accesses) == 0); } EXPORT_SYMBOL(fscache_withdraw_cache); #ifdef CONFIG_PROC_FS static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW"; /* * Generate a list of caches in /proc/fs/fscache/caches */ static int fscache_caches_seq_show(struct seq_file *m, void *v) { struct fscache_cache *cache; if (v == &fscache_caches) { seq_puts(m, "CACHE REF VOLS OBJS ACCES S NAME\n" "======== ===== ===== ===== ===== = ===============\n" ); return 0; } cache = list_entry(v, struct fscache_cache, cache_link); seq_printf(m, "%08x %5d %5d %5d %5d %c %s\n", cache->debug_id, refcount_read(&cache->ref), atomic_read(&cache->n_volumes), atomic_read(&cache->object_count), atomic_read(&cache->n_accesses), fscache_cache_states[cache->state], cache->name ?: "-"); return 0; } static void *fscache_caches_seq_start(struct seq_file *m, loff_t *_pos) __acquires(fscache_addremove_sem) { down_read(&fscache_addremove_sem); return seq_list_start_head(&fscache_caches, *_pos); } static void *fscache_caches_seq_next(struct seq_file *m, void *v, loff_t *_pos) { return seq_list_next(v, &fscache_caches, _pos); } static void fscache_caches_seq_stop(struct seq_file *m, void *v) __releases(fscache_addremove_sem) { up_read(&fscache_addremove_sem); } const struct seq_operations fscache_caches_seq_ops = { .start = fscache_caches_seq_start, .next = fscache_caches_seq_next, .stop = fscache_caches_seq_stop, .show = fscache_caches_seq_show, }; #endif /* CONFIG_PROC_FS */ |
| 609 609 609 609 609 6 6 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> * * Based on net/ipv4/xfrm4_tunnel.c */ #include <linux/module.h> #include <linux/xfrm.h> #include <linux/slab.h> #include <linux/rculist.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <net/netns/generic.h> #define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 #define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 #define XFRM6_TUNNEL_SPI_MIN 1 #define XFRM6_TUNNEL_SPI_MAX 0xffffffff struct xfrm6_tunnel_net { struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; u32 spi; }; static unsigned int xfrm6_tunnel_net_id __read_mostly; static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) { return net_generic(net, xfrm6_tunnel_net_id); } /* * xfrm_tunnel_spi things are for allocating unique id ("spi") * per xfrm_address_t. */ struct xfrm6_tunnel_spi { struct hlist_node list_byaddr; struct hlist_node list_byspi; xfrm_address_t addr; u32 spi; refcount_t refcnt; struct rcu_head rcu_head; }; static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) { unsigned int h; h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; return h; } static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi) { return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; } static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } return NULL; } __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; rcu_read_lock_bh(); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); spi = x6spi ? x6spi->spi : 0; rcu_read_unlock_bh(); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; int index = xfrm6_tunnel_spi_hash_byspi(spi); hlist_for_each_entry(x6spi, &xfrm6_tn->spi_byspi[index], list_byspi) { if (x6spi->spi == spi) return -1; } return index; } static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); u32 spi; struct xfrm6_tunnel_spi *x6spi; int index; if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; else xfrm6_tn->spi++; for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; if (spi == XFRM6_TUNNEL_SPI_MAX) break; } for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; } spi = 0; goto out; alloc_spi: xfrm6_tn->spi = spi; x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); if (!x6spi) goto out; memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; refcount_set(&x6spi->refcnt, 1); hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); index = xfrm6_tunnel_spi_hash_byaddr(saddr); hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); out: return spi; } __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; spin_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); if (x6spi) { refcount_inc(&x6spi->refcnt); spi = x6spi->spi; } else spi = __xfrm6_tunnel_alloc_spi(net, saddr); spin_unlock_bh(&xfrm6_tunnel_spi_lock); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); static void x6spi_destroy_rcu(struct rcu_head *head) { kmem_cache_free(xfrm6_tunnel_spi_kmem, container_of(head, struct xfrm6_tunnel_spi, rcu_head)); } static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; struct hlist_node *n; spin_lock_bh(&xfrm6_tunnel_spi_lock); hlist_for_each_entry_safe(x6spi, n, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (refcount_dec_and_test(&x6spi->refcnt)) { hlist_del_rcu(&x6spi->list_byaddr); hlist_del_rcu(&x6spi->list_byspi); call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); break; } } } spin_unlock_bh(&xfrm6_tunnel_spi_lock); } static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { return skb_network_header(skb)[IP6CB(skb)->nhoff]; } static int xfrm6_tunnel_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* xfrm6_tunnel native err handling */ switch (type) { case ICMPV6_DEST_UNREACH: switch (code) { case ICMPV6_NOROUTE: case ICMPV6_ADM_PROHIBITED: case ICMPV6_NOT_NEIGHBOUR: case ICMPV6_ADDR_UNREACH: case ICMPV6_PORT_UNREACH: default: break; } break; case ICMPV6_PKT_TOOBIG: break; case ICMPV6_TIME_EXCEED: switch (code) { case ICMPV6_EXC_HOPLIMIT: break; case ICMPV6_EXC_FRAGTIME: default: break; } break; case ICMPV6_PARAMPROB: switch (code) { case ICMPV6_HDR_FIELD: break; case ICMPV6_UNK_NEXTHDR: break; case ICMPV6_UNK_OPTION: break; } break; default: break; } return 0; } static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->props.mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode"); return -EINVAL; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6hdr); return 0; } static void xfrm6_tunnel_destroy(struct xfrm_state *x) { struct net *net = xs_net(x); xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); } static const struct xfrm_type xfrm6_tunnel_type = { .owner = THIS_MODULE, .proto = IPPROTO_IPV6, .init_state = xfrm6_tunnel_init_state, .destructor = xfrm6_tunnel_destroy, .input = xfrm6_tunnel_input, .output = xfrm6_tunnel_output, }; static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static int __net_init xfrm6_tunnel_net_init(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); xfrm6_tn->spi = 0; return 0; } static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { .init = xfrm6_tunnel_net_init, .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct xfrm6_tunnel_net), }; static int __init xfrm6_tunnel_init(void) { int rv; xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); if (rv < 0) goto out_pernet; rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); if (rv < 0) goto out_type; rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); if (rv < 0) goto out_xfrm6; rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); if (rv < 0) goto out_xfrm46; return 0; out_xfrm46: xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); out_xfrm6: xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); out_type: unregister_pernet_subsys(&xfrm6_tunnel_net_ops); out_pernet: kmem_cache_destroy(xfrm6_tunnel_spi_kmem); return rv; } static void __exit xfrm6_tunnel_fini(void) { xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); unregister_pernet_subsys(&xfrm6_tunnel_net_ops); /* Someone maybe has gotten the xfrm6_tunnel_spi. * So need to wait it. */ rcu_barrier(); kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); |
| 10832 10832 7775 7787 7830 9700 7495 9700 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #ifndef __AA_LABEL_H #define __AA_LABEL_H #include <linux/atomic.h> #include <linux/audit.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> #include "apparmor.h" #include "lib.h" struct aa_ns; #define LOCAL_VEC_ENTRIES 8 #define DEFINE_VEC(T, V) \ struct aa_ ## T *(_ ## V ## _localtmp)[LOCAL_VEC_ENTRIES]; \ struct aa_ ## T **(V) #define vec_setup(T, V, N, GFP) \ ({ \ if ((N) <= LOCAL_VEC_ENTRIES) { \ typeof(N) i; \ (V) = (_ ## V ## _localtmp); \ for (i = 0; i < (N); i++) \ (V)[i] = NULL; \ } else \ (V) = kzalloc(sizeof(struct aa_ ## T *) * (N), (GFP)); \ (V) ? 0 : -ENOMEM; \ }) #define vec_cleanup(T, V, N) \ do { \ int i; \ for (i = 0; i < (N); i++) { \ if (!IS_ERR_OR_NULL((V)[i])) \ aa_put_ ## T((V)[i]); \ } \ if ((V) != _ ## V ## _localtmp) \ kfree(V); \ } while (0) #define vec_last(VEC, SIZE) ((VEC)[(SIZE) - 1]) #define vec_ns(VEC, SIZE) (vec_last((VEC), (SIZE))->ns) #define vec_labelset(VEC, SIZE) (&vec_ns((VEC), (SIZE))->labels) #define cleanup_domain_vec(V, L) cleanup_label_vec((V), (L)->size) struct aa_profile; #define VEC_FLAG_TERMINATE 1 int aa_vec_unique(struct aa_profile **vec, int n, int flags); struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp); #define aa_sort_and_merge_vec(N, V) \ aa_sort_and_merge_profiles((N), (struct aa_profile **)(V)) /* struct aa_labelset - set of labels for a namespace * * Labels are reference counted; aa_labelset does not contribute to label * reference counts. Once a label's last refcount is put it is removed from * the set. */ struct aa_labelset { rwlock_t lock; struct rb_root root; }; #define __labelset_for_each(LS, N) \ for ((N) = rb_first(&(LS)->root); (N); (N) = rb_next(N)) enum label_flags { FLAG_HAT = 1, /* profile is a hat */ FLAG_UNCONFINED = 2, /* label unconfined only if all */ FLAG_NULL = 4, /* profile is null learning profile */ FLAG_IX_ON_NAME_ERROR = 8, /* fallback to ix on name lookup fail */ FLAG_IMMUTIBLE = 0x10, /* don't allow changes/replacement */ FLAG_USER_DEFINED = 0x20, /* user based profile - lower privs */ FLAG_NO_LIST_REF = 0x40, /* list doesn't keep profile ref */ FLAG_NS_COUNT = 0x80, /* carries NS ref count */ FLAG_IN_TREE = 0x100, /* label is in tree */ FLAG_PROFILE = 0x200, /* label is a profile */ FLAG_EXPLICIT = 0x400, /* explicit static label */ FLAG_STALE = 0x800, /* replaced/removed */ FLAG_RENAMED = 0x1000, /* label has renaming in it */ FLAG_REVOKED = 0x2000, /* label has revocation in it */ FLAG_DEBUG1 = 0x4000, FLAG_DEBUG2 = 0x8000, /* These flags must correspond with PATH_flags */ /* TODO: add new path flags */ }; struct aa_label; struct aa_proxy { struct kref count; struct aa_label __rcu *label; }; struct label_it { int i, j; }; /* struct aa_label - lazy labeling struct * @count: ref count of active users * @node: rbtree position * @rcu: rcu callback struct * @proxy: is set to the label that replaced this label * @hname: text representation of the label (MAYBE_NULL) * @flags: stale and other flags - values may change under label set lock * @secid: secid that references this label * @size: number of entries in @ent[] * @ent: set of profiles for label, actual size determined by @size */ struct aa_label { struct kref count; struct rb_node node; struct rcu_head rcu; struct aa_proxy *proxy; __counted char *hname; long flags; u32 secid; int size; struct aa_profile *vec[]; }; #define last_error(E, FN) \ do { \ int __subE = (FN); \ if (__subE) \ (E) = __subE; \ } while (0) #define label_isprofile(X) ((X)->flags & FLAG_PROFILE) #define label_unconfined(X) ((X)->flags & FLAG_UNCONFINED) #define unconfined(X) label_unconfined(X) #define label_is_stale(X) ((X)->flags & FLAG_STALE) #define __label_make_stale(X) ((X)->flags |= FLAG_STALE) #define labels_ns(X) (vec_ns(&((X)->vec[0]), (X)->size)) #define labels_set(X) (&labels_ns(X)->labels) #define labels_view(X) labels_ns(X) #define labels_profile(X) ((X)->vec[(X)->size - 1]) int aa_label_next_confined(struct aa_label *l, int i); /* for each profile in a label */ #define label_for_each(I, L, P) \ for ((I).i = 0; ((P) = (L)->vec[(I).i]); ++((I).i)) /* assumes break/goto ended label_for_each */ #define label_for_each_cont(I, L, P) \ for (++((I).i); ((P) = (L)->vec[(I).i]); ++((I).i)) /* for each profile that is enforcing confinement in a label */ #define label_for_each_confined(I, L, P) \ for ((I).i = aa_label_next_confined((L), 0); \ ((P) = (L)->vec[(I).i]); \ (I).i = aa_label_next_confined((L), (I).i + 1)) #define label_for_each_in_merge(I, A, B, P) \ for ((I).i = (I).j = 0; \ ((P) = aa_label_next_in_merge(&(I), (A), (B))); \ ) #define label_for_each_not_in_set(I, SET, SUB, P) \ for ((I).i = (I).j = 0; \ ((P) = __aa_label_next_not_in_set(&(I), (SET), (SUB))); \ ) #define next_in_ns(i, NS, L) \ ({ \ typeof(i) ___i = (i); \ while ((L)->vec[___i] && (L)->vec[___i]->ns != (NS)) \ (___i)++; \ (___i); \ }) #define label_for_each_in_ns(I, NS, L, P) \ for ((I).i = next_in_ns(0, (NS), (L)); \ ((P) = (L)->vec[(I).i]); \ (I).i = next_in_ns((I).i + 1, (NS), (L))) #define fn_for_each_in_ns(L, P, FN) \ ({ \ struct label_it __i; \ struct aa_ns *__ns = labels_ns(L); \ int __E = 0; \ label_for_each_in_ns(__i, __ns, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_XXX(L, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each(L, P, FN) fn_for_each_XXX(L, P, FN) #define fn_for_each_confined(L, P, FN) fn_for_each_XXX(L, P, FN, _confined) #define fn_for_each2_XXX(L1, L2, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L1), (L2), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_in_merge(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _in_merge) #define fn_for_each_not_in_set(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _not_in_set) #define LABEL_MEDIATES(L, C) \ ({ \ struct aa_profile *profile; \ struct label_it i; \ int ret = 0; \ label_for_each(i, (L), profile) { \ if (RULE_MEDIATES(&profile->rules, (C))) { \ ret = 1; \ break; \ } \ } \ ret; \ }) void aa_labelset_destroy(struct aa_labelset *ls); void aa_labelset_init(struct aa_labelset *ls); void __aa_labelset_update_subtree(struct aa_ns *ns); void aa_label_destroy(struct aa_label *label); void aa_label_free(struct aa_label *label); void aa_label_kref(struct kref *kref); bool aa_label_init(struct aa_label *label, int size, gfp_t gfp); struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp); bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub); bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub); struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub); bool aa_label_remove(struct aa_label *label); struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *l); bool aa_label_replace(struct aa_label *old, struct aa_label *new); bool aa_label_make_newest(struct aa_labelset *ls, struct aa_label *old, struct aa_label *new); struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp); bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp); #define FLAGS_NONE 0 #define FLAG_SHOW_MODE 1 #define FLAG_VIEW_SUBNS 2 #define FLAG_HIDDEN_UNCONFINED 4 #define FLAG_ABS_ROOT 8 int aa_label_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_label *label, int flags); int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_printk(struct aa_label *label, gfp_t gfp); struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack); struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack); static inline const char *aa_label_strn_split(const char *str, int n) { const char *pos; aa_state_t state; state = aa_dfa_matchn_until(stacksplitdfa, DFA_START, str, n, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } static inline const char *aa_label_str_split(const char *str) { const char *pos; aa_state_t state; state = aa_dfa_match_until(stacksplitdfa, DFA_START, str, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } struct aa_perms; struct aa_ruleset; int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms); /** * __aa_get_label - get a reference count to uncounted label reference * @l: reference to get a count on * * Returns: pointer to reference OR NULL if race is lost and reference is * being repeated. * Requires: lock held, and the return code MUST be checked */ static inline struct aa_label *__aa_get_label(struct aa_label *l) { if (l && kref_get_unless_zero(&l->count)) return l; return NULL; } static inline struct aa_label *aa_get_label(struct aa_label *l) { if (l) kref_get(&(l->count)); return l; } /** * aa_get_label_rcu - increment refcount on a label that can be replaced * @l: pointer to label that can be replaced (NOT NULL) * * Returns: pointer to a refcounted label. * else NULL if no label */ static inline struct aa_label *aa_get_label_rcu(struct aa_label __rcu **l) { struct aa_label *c; rcu_read_lock(); do { c = rcu_dereference(*l); } while (c && !kref_get_unless_zero(&c->count)); rcu_read_unlock(); return c; } /** * aa_get_newest_label - find the newest version of @l * @l: the label to check for newer versions of * * Returns: refcounted newest version of @l taking into account * replacement, renames and removals * return @l. */ static inline struct aa_label *aa_get_newest_label(struct aa_label *l) { if (!l) return NULL; if (label_is_stale(l)) { struct aa_label *tmp; AA_BUG(!l->proxy); AA_BUG(!l->proxy->label); /* BUG: only way this can happen is @l ref count and its * replacement count have gone to 0 and are on their way * to destruction. ie. we have a refcounting error */ tmp = aa_get_label_rcu(&l->proxy->label); AA_BUG(!tmp); return tmp; } return aa_get_label(l); } static inline void aa_put_label(struct aa_label *l) { if (l) kref_put(&l->count, aa_label_kref); } struct aa_proxy *aa_alloc_proxy(struct aa_label *l, gfp_t gfp); void aa_proxy_kref(struct kref *kref); static inline struct aa_proxy *aa_get_proxy(struct aa_proxy *proxy) { if (proxy) kref_get(&(proxy->count)); return proxy; } static inline void aa_put_proxy(struct aa_proxy *proxy) { if (proxy) kref_put(&proxy->count, aa_proxy_kref); } void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new); #endif /* __AA_LABEL_H */ |
| 23 23 23 23 23 23 23 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | // SPDX-License-Identifier: GPL-2.0-or-later /* Signature verification with an asymmetric key * * See Documentation/crypto/asymmetric-keys.rst * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "SIG: "fmt #include <keys/asymmetric-subtype.h> #include <linux/export.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/keyctl.h> #include <crypto/public_key.h> #include <keys/user-type.h> #include "asymmetric_keys.h" /* * Destroy a public key signature. */ void public_key_signature_free(struct public_key_signature *sig) { int i; if (sig) { for (i = 0; i < ARRAY_SIZE(sig->auth_ids); i++) kfree(sig->auth_ids[i]); kfree(sig->s); kfree(sig->digest); kfree(sig); } } EXPORT_SYMBOL_GPL(public_key_signature_free); /** * query_asymmetric_key - Get information about an asymmetric key. * @params: Various parameters. * @info: Where to put the information. */ int query_asymmetric_key(const struct kernel_pkey_params *params, struct kernel_pkey_query *info) { const struct asymmetric_key_subtype *subtype; struct key *key = params->key; int ret; pr_devel("==>%s()\n", __func__); if (key->type != &key_type_asymmetric) return -EINVAL; subtype = asymmetric_key_subtype(key); if (!subtype || !key->payload.data[0]) return -EINVAL; if (!subtype->query) return -ENOTSUPP; ret = subtype->query(params, info); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL_GPL(query_asymmetric_key); /** * verify_signature - Initiate the use of an asymmetric key to verify a signature * @key: The asymmetric key to verify against * @sig: The signature to check * * Returns 0 if successful or else an error. */ int verify_signature(const struct key *key, const struct public_key_signature *sig) { const struct asymmetric_key_subtype *subtype; int ret; pr_devel("==>%s()\n", __func__); if (key->type != &key_type_asymmetric) return -EINVAL; subtype = asymmetric_key_subtype(key); if (!subtype || !key->payload.data[0]) return -EINVAL; if (!subtype->verify_signature) return -ENOTSUPP; ret = subtype->verify_signature(key, sig); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } EXPORT_SYMBOL_GPL(verify_signature); |
| 11 11 11 11 4 4 4 4 4 3 3 3 1 1 1 7 7 1 1 1 1 3 1 7 7 3 3 3 3 3 3 1 1 1 1 3 3 3 3 3 3 3 3 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 | // SPDX-License-Identifier: GPL-2.0-or-later /* * PPP synchronous tty channel driver for Linux. * * This is a ppp channel driver that can be used with tty device drivers * that are frame oriented, such as synchronous HDLC devices. * * Complete PPP frames without encoding/decoding are exchanged between * the channel driver and the device driver. * * The async map IOCTL codes are implemented to keep the user mode * applications happy if they call them. Synchronous PPP does not use * the async maps. * * Copyright 1999 Paul Mackerras. * * Also touched by the grubby hands of Paul Fulghum paulkf@microgate.com * * This driver provides the encapsulation and framing for sending * and receiving PPP frames over sync serial lines. It relies on * the generic PPP layer to give it frames to send and to process * received frames. It implements the PPP line discipline. * * Part of the code in this driver was inspired by the old async-only * PPP driver, written by Michael Callahan and Al Longyear, and * subsequently hacked by Paul Mackerras. * * ==FILEVERSION 20040616== */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tty.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/refcount.h> #include <linux/unaligned.h> #include <linux/uaccess.h> #define PPP_VERSION "2.4.2" /* Structure for storing local state. */ struct syncppp { struct tty_struct *tty; unsigned int flags; unsigned int rbits; int mru; spinlock_t xmit_lock; spinlock_t recv_lock; unsigned long xmit_flags; u32 xaccm[8]; u32 raccm; unsigned int bytes_sent; unsigned int bytes_rcvd; struct sk_buff *tpkt; unsigned long last_xmit; struct sk_buff_head rqueue; struct tasklet_struct tsk; refcount_t refcnt; struct completion dead_cmp; struct ppp_channel chan; /* interface to generic ppp layer */ }; /* Bit numbers in xmit_flags */ #define XMIT_WAKEUP 0 #define XMIT_FULL 1 /* Bits in rbits */ #define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) #define PPPSYNC_MAX_RQLEN 32 /* arbitrary */ /* * Prototypes. */ static struct sk_buff* ppp_sync_txmunge(struct syncppp *ap, struct sk_buff *); static int ppp_sync_send(struct ppp_channel *chan, struct sk_buff *skb); static int ppp_sync_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg); static void ppp_sync_process(struct tasklet_struct *t); static int ppp_sync_push(struct syncppp *ap); static void ppp_sync_flush_output(struct syncppp *ap); static void ppp_sync_input(struct syncppp *ap, const u8 *buf, const u8 *flags, int count); static const struct ppp_channel_ops sync_ops = { .start_xmit = ppp_sync_send, .ioctl = ppp_sync_ioctl, }; /* * Utility procedure to print a buffer in hex/ascii */ static void ppp_print_buffer (const char *name, const __u8 *buf, int count) { if (name != NULL) printk(KERN_DEBUG "ppp_synctty: %s, count = %d\n", name, count); print_hex_dump_bytes("", DUMP_PREFIX_NONE, buf, count); } /* * Routines implementing the synchronous PPP line discipline. */ /* * We have a potential race on dereferencing tty->disc_data, * because the tty layer provides no locking at all - thus one * cpu could be running ppp_synctty_receive while another * calls ppp_synctty_close, which zeroes tty->disc_data and * frees the memory that ppp_synctty_receive is using. The best * way to fix this is to use a rwlock in the tty struct, but for now * we use a single global rwlock for all ttys in ppp line discipline. * * FIXME: Fixed in tty_io nowadays. */ static DEFINE_RWLOCK(disc_data_lock); static struct syncppp *sp_get(struct tty_struct *tty) { struct syncppp *ap; read_lock(&disc_data_lock); ap = tty->disc_data; if (ap != NULL) refcount_inc(&ap->refcnt); read_unlock(&disc_data_lock); return ap; } static void sp_put(struct syncppp *ap) { if (refcount_dec_and_test(&ap->refcnt)) complete(&ap->dead_cmp); } /* * Called when a tty is put into sync-PPP line discipline. */ static int ppp_sync_open(struct tty_struct *tty) { struct syncppp *ap; int err; int speed; if (tty->ops->write == NULL) return -EOPNOTSUPP; ap = kzalloc(sizeof(*ap), GFP_KERNEL); err = -ENOMEM; if (!ap) goto out; /* initialize the syncppp structure */ ap->tty = tty; ap->mru = PPP_MRU; spin_lock_init(&ap->xmit_lock); spin_lock_init(&ap->recv_lock); ap->xaccm[0] = ~0U; ap->xaccm[3] = 0x60000000U; ap->raccm = ~0U; skb_queue_head_init(&ap->rqueue); tasklet_setup(&ap->tsk, ppp_sync_process); refcount_set(&ap->refcnt, 1); init_completion(&ap->dead_cmp); ap->chan.private = ap; ap->chan.ops = &sync_ops; ap->chan.mtu = PPP_MRU; ap->chan.hdrlen = 2; /* for A/C bytes */ speed = tty_get_baud_rate(tty); ap->chan.speed = speed; err = ppp_register_channel(&ap->chan); if (err) goto out_free; tty->disc_data = ap; tty->receive_room = 65536; return 0; out_free: kfree(ap); out: return err; } /* * Called when the tty is put into another line discipline * or it hangs up. We have to wait for any cpu currently * executing in any of the other ppp_synctty_* routines to * finish before we can call ppp_unregister_channel and free * the syncppp struct. This routine must be called from * process context, not interrupt or softirq context. */ static void ppp_sync_close(struct tty_struct *tty) { struct syncppp *ap; write_lock_irq(&disc_data_lock); ap = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!ap) return; /* * We have now ensured that nobody can start using ap from now * on, but we have to wait for all existing users to finish. * Note that ppp_unregister_channel ensures that no calls to * our channel ops (i.e. ppp_sync_send/ioctl) are in progress * by the time it returns. */ if (!refcount_dec_and_test(&ap->refcnt)) wait_for_completion(&ap->dead_cmp); tasklet_kill(&ap->tsk); ppp_unregister_channel(&ap->chan); skb_queue_purge(&ap->rqueue); kfree_skb(ap->tpkt); kfree(ap); } /* * Called on tty hangup in process context. * * Wait for I/O to driver to complete and unregister PPP channel. * This is already done by the close routine, so just call that. */ static void ppp_sync_hangup(struct tty_struct *tty) { ppp_sync_close(tty); } /* * Read does nothing - no data is ever available this way. * Pppd reads and writes packets via /dev/ppp instead. */ static ssize_t ppp_sync_read(struct tty_struct *tty, struct file *file, u8 *buf, size_t count, void **cookie, unsigned long offset) { return -EAGAIN; } /* * Write on the tty does nothing, the packets all come in * from the ppp generic stuff. */ static ssize_t ppp_sync_write(struct tty_struct *tty, struct file *file, const u8 *buf, size_t count) { return -EAGAIN; } static int ppp_synctty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct syncppp *ap = sp_get(tty); int __user *p = (int __user *)arg; int err, val; if (!ap) return -ENXIO; err = -EFAULT; switch (cmd) { case PPPIOCGCHAN: err = -EFAULT; if (put_user(ppp_channel_index(&ap->chan), p)) break; err = 0; break; case PPPIOCGUNIT: err = -EFAULT; if (put_user(ppp_unit_number(&ap->chan), p)) break; err = 0; break; case TCFLSH: /* flush our buffers and the serial port's buffer */ if (arg == TCIOFLUSH || arg == TCOFLUSH) ppp_sync_flush_output(ap); err = n_tty_ioctl_helper(tty, cmd, arg); break; case FIONREAD: val = 0; if (put_user(val, p)) break; err = 0; break; default: err = tty_mode_ioctl(tty, cmd, arg); break; } sp_put(ap); return err; } /* May sleep, don't call from interrupt level or with interrupts disabled */ static void ppp_sync_receive(struct tty_struct *tty, const u8 *buf, const u8 *cflags, size_t count) { struct syncppp *ap = sp_get(tty); unsigned long flags; if (!ap) return; spin_lock_irqsave(&ap->recv_lock, flags); ppp_sync_input(ap, buf, cflags, count); spin_unlock_irqrestore(&ap->recv_lock, flags); if (!skb_queue_empty(&ap->rqueue)) tasklet_schedule(&ap->tsk); sp_put(ap); tty_unthrottle(tty); } static void ppp_sync_wakeup(struct tty_struct *tty) { struct syncppp *ap = sp_get(tty); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); if (!ap) return; set_bit(XMIT_WAKEUP, &ap->xmit_flags); tasklet_schedule(&ap->tsk); sp_put(ap); } static struct tty_ldisc_ops ppp_sync_ldisc = { .owner = THIS_MODULE, .num = N_SYNC_PPP, .name = "pppsync", .open = ppp_sync_open, .close = ppp_sync_close, .hangup = ppp_sync_hangup, .read = ppp_sync_read, .write = ppp_sync_write, .ioctl = ppp_synctty_ioctl, .receive_buf = ppp_sync_receive, .write_wakeup = ppp_sync_wakeup, }; static int __init ppp_sync_init(void) { int err; err = tty_register_ldisc(&ppp_sync_ldisc); if (err != 0) printk(KERN_ERR "PPP_sync: error %d registering line disc.\n", err); return err; } /* * The following routines provide the PPP channel interface. */ static int ppp_sync_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct syncppp *ap = chan->private; int err, val; u32 accm[8]; void __user *argp = (void __user *)arg; u32 __user *p = argp; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = ap->flags | ap->rbits; if (put_user(val, (int __user *) argp)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, (int __user *) argp)) break; ap->flags = val & ~SC_RCV_BITS; spin_lock_irq(&ap->recv_lock); ap->rbits = val & SC_RCV_BITS; spin_unlock_irq(&ap->recv_lock); err = 0; break; case PPPIOCGASYNCMAP: if (put_user(ap->xaccm[0], p)) break; err = 0; break; case PPPIOCSASYNCMAP: if (get_user(ap->xaccm[0], p)) break; err = 0; break; case PPPIOCGRASYNCMAP: if (put_user(ap->raccm, p)) break; err = 0; break; case PPPIOCSRASYNCMAP: if (get_user(ap->raccm, p)) break; err = 0; break; case PPPIOCGXASYNCMAP: if (copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm))) break; err = 0; break; case PPPIOCSXASYNCMAP: if (copy_from_user(accm, argp, sizeof(accm))) break; accm[2] &= ~0x40000000U; /* can't escape 0x5e */ accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */ memcpy(ap->xaccm, accm, sizeof(ap->xaccm)); err = 0; break; case PPPIOCGMRU: if (put_user(ap->mru, (int __user *) argp)) break; err = 0; break; case PPPIOCSMRU: if (get_user(val, (int __user *) argp)) break; if (val > U16_MAX) { err = -EINVAL; break; } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; err = 0; break; default: err = -ENOTTY; } return err; } /* * This is called at softirq level to deliver received packets * to the ppp_generic code, and to tell the ppp_generic code * if we can accept more output now. */ static void ppp_sync_process(struct tasklet_struct *t) { struct syncppp *ap = from_tasklet(ap, t, tsk); struct sk_buff *skb; /* process received packets */ while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->len == 0) { /* zero length buffers indicate error */ ppp_input_error(&ap->chan, 0); kfree_skb(skb); } else ppp_input(&ap->chan, skb); } /* try to push more stuff out */ if (test_bit(XMIT_WAKEUP, &ap->xmit_flags) && ppp_sync_push(ap)) ppp_output_wakeup(&ap->chan); } /* * Procedures for encapsulation and framing. */ static struct sk_buff* ppp_sync_txmunge(struct syncppp *ap, struct sk_buff *skb) { int proto; unsigned char *data; int islcp; /* Ensure we can safely access protocol field and LCP code */ if (!pskb_may_pull(skb, 3)) { kfree_skb(skb); return NULL; } data = skb->data; proto = get_unaligned_be16(data); /* LCP packets with codes between 1 (configure-request) * and 7 (code-reject) must be sent as though no options * have been negotiated. */ islcp = proto == PPP_LCP && 1 <= data[2] && data[2] <= 7; /* compress protocol field if option enabled */ if (data[0] == 0 && (ap->flags & SC_COMP_PROT) && !islcp) skb_pull(skb,1); /* prepend address/control fields if necessary */ if ((ap->flags & SC_COMP_AC) == 0 || islcp) { if (skb_headroom(skb) < 2) { struct sk_buff *npkt = dev_alloc_skb(skb->len + 2); if (npkt == NULL) { kfree_skb(skb); return NULL; } skb_reserve(npkt,2); skb_copy_from_linear_data(skb, skb_put(npkt, skb->len), skb->len); consume_skb(skb); skb = npkt; } skb_push(skb,2); skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; } ap->last_xmit = jiffies; if (skb && ap->flags & SC_LOG_OUTPKT) ppp_print_buffer ("send buffer", skb->data, skb->len); return skb; } /* * Transmit-side routines. */ /* * Send a packet to the peer over an sync tty line. * Returns 1 iff the packet was accepted. * If the packet was not accepted, we will call ppp_output_wakeup * at some later time. */ static int ppp_sync_send(struct ppp_channel *chan, struct sk_buff *skb) { struct syncppp *ap = chan->private; ppp_sync_push(ap); if (test_and_set_bit(XMIT_FULL, &ap->xmit_flags)) return 0; /* already full */ skb = ppp_sync_txmunge(ap, skb); if (skb != NULL) ap->tpkt = skb; else clear_bit(XMIT_FULL, &ap->xmit_flags); ppp_sync_push(ap); return 1; } /* * Push as much data as possible out to the tty. */ static int ppp_sync_push(struct syncppp *ap) { int sent, done = 0; struct tty_struct *tty = ap->tty; int tty_stuffed = 0; if (!spin_trylock_bh(&ap->xmit_lock)) return 0; for (;;) { if (test_and_clear_bit(XMIT_WAKEUP, &ap->xmit_flags)) tty_stuffed = 0; if (!tty_stuffed && ap->tpkt) { set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); sent = tty->ops->write(tty, ap->tpkt->data, ap->tpkt->len); if (sent < 0) goto flush; /* error, e.g. loss of CD */ if (sent < ap->tpkt->len) { tty_stuffed = 1; } else { consume_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } continue; } /* haven't made any progress */ spin_unlock_bh(&ap->xmit_lock); if (!(test_bit(XMIT_WAKEUP, &ap->xmit_flags) || (!tty_stuffed && ap->tpkt))) break; if (!spin_trylock_bh(&ap->xmit_lock)) break; } return done; flush: if (ap->tpkt) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } spin_unlock_bh(&ap->xmit_lock); return done; } /* * Flush output from our internal buffers. * Called for the TCFLSH ioctl. */ static void ppp_sync_flush_output(struct syncppp *ap) { int done = 0; spin_lock_bh(&ap->xmit_lock); if (ap->tpkt != NULL) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } spin_unlock_bh(&ap->xmit_lock); if (done) ppp_output_wakeup(&ap->chan); } /* * Receive-side routines. */ /* called when the tty driver has data for us. * * Data is frame oriented: each call to ppp_sync_input is considered * a whole frame. If the 1st flag byte is non-zero then the whole * frame is considered to be in error and is tossed. */ static void ppp_sync_input(struct syncppp *ap, const u8 *buf, const u8 *flags, int count) { struct sk_buff *skb; unsigned char *p; if (count == 0) return; if (ap->flags & SC_LOG_INPKT) ppp_print_buffer ("receive buffer", buf, count); /* stuff the chars in the skb */ skb = dev_alloc_skb(ap->mru + PPP_HDRLEN + 2); if (!skb) { printk(KERN_ERR "PPPsync: no memory (input pkt)\n"); goto err; } /* Try to get the payload 4-byte aligned */ if (buf[0] != PPP_ALLSTATIONS) skb_reserve(skb, 2 + (buf[0] & 1)); if (flags && *flags) { /* error flag set, ignore frame */ goto err; } else if (count > skb_tailroom(skb)) { /* packet overflowed MRU */ goto err; } skb_put_data(skb, buf, count); /* strip address/control field if present */ p = skb->data; if (skb->len >= 2 && p[0] == PPP_ALLSTATIONS && p[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto err; p = skb_pull(skb, 2); } /* PPP packet length should be >= 2 bytes when protocol field is not * compressed. */ if (!(p[0] & 0x01) && skb->len < 2) goto err; /* queue the frame to be processed */ skb_queue_tail(&ap->rqueue, skb); return; err: /* queue zero length packet as error indication */ if (skb || (skb = dev_alloc_skb(0))) { skb_trim(skb, 0); skb_queue_tail(&ap->rqueue, skb); } } static void __exit ppp_sync_cleanup(void) { tty_unregister_ldisc(&ppp_sync_ldisc); } module_init(ppp_sync_init); module_exit(ppp_sync_cleanup); MODULE_DESCRIPTION("PPP synchronous TTY channel module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_SYNC_PPP); |
| 1 604 606 605 603 552 549 62 62 53 54 250 248 43 43 231 233 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | // SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave); |
| 1 1 1 1 1 1 1 1 1 1 1 1 4 1 3 3 4 1 6 6 6 6 11 10 10 4 7 4 3 3 4 4 4 10 4 2 2 4 5 5 5 5 5 4 1 1 1 4 4 4 5 5 5 2 1 4 2 2 2 2 2 2 2 2 2 5 6 6 4 3 2 2 6 8 8 6 1 5 2 2 2 2 2 2 2 2 8 5 1 4 4 4 1 3 1 2 1 1 2 1 3 5 5 5 5 4 4 3 3 3 3 3 1 2 2 2 2 2 3 619 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_api.c Packet classifier API. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/jhash.h> #include <linux/rculist.h> #include <linux/rhashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_pedit.h> #include <net/tc_act/tc_mirred.h> #include <net/tc_act/tc_vlan.h> #include <net/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_csum.h> #include <net/tc_act/tc_gact.h> #include <net/tc_act/tc_police.h> #include <net/tc_act/tc_sample.h> #include <net/tc_act/tc_skbedit.h> #include <net/tc_act/tc_ct.h> #include <net/tc_act/tc_mpls.h> #include <net/tc_act/tc_gate.h> #include <net/flow_offload.h> #include <net/tc_wrapper.h> /* The list of all installed classifier types */ static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); static struct xarray tcf_exts_miss_cookies_xa; struct tcf_exts_miss_cookie_node { const struct tcf_chain *chain; const struct tcf_proto *tp; const struct tcf_exts *exts; u32 chain_index; u32 tp_prio; u32 handle; u32 miss_cookie_base; struct rcu_head rcu; }; /* Each tc action entry cookie will be comprised of 32bit miss_cookie_base + * action index in the exts tc actions array. */ union tcf_exts_miss_cookie { struct { u32 miss_cookie_base; u32 act_index; }; u64 miss_cookie; }; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static int tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, u32 handle) { struct tcf_exts_miss_cookie_node *n; static u32 next; int err; if (WARN_ON(!handle || !tp->ops->get_exts)) return -EINVAL; n = kzalloc(sizeof(*n), GFP_KERNEL); if (!n) return -ENOMEM; n->chain_index = tp->chain->index; n->chain = tp->chain; n->tp_prio = tp->prio; n->tp = tp; n->exts = exts; n->handle = handle; err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; return 0; err_xa_alloc: kfree(n); return err; } static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) { struct tcf_exts_miss_cookie_node *n; if (!exts->miss_cookie_node) return; n = exts->miss_cookie_node; xa_erase(&tcf_exts_miss_cookies_xa, n->miss_cookie_base); kfree_rcu(n, rcu); } static struct tcf_exts_miss_cookie_node * tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index) { union tcf_exts_miss_cookie mc = { .miss_cookie = miss_cookie, }; *act_index = mc.act_index; return xa_load(&tcf_exts_miss_cookies_xa, mc.miss_cookie_base); } #else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */ static int tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, u32 handle) { return 0; } static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) { } #endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */ static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index) { union tcf_exts_miss_cookie mc = { .act_index = act_index, }; if (!miss_cookie_base) return 0; mc.miss_cookie_base = miss_cookie_base; return mc.miss_cookie; } #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc); EXPORT_SYMBOL(tc_skb_ext_tc); void tc_skb_ext_tc_enable(void) { static_branch_inc(&tc_skb_ext_tc); } EXPORT_SYMBOL(tc_skb_ext_tc_enable); void tc_skb_ext_tc_disable(void) { static_branch_dec(&tc_skb_ext_tc); } EXPORT_SYMBOL(tc_skb_ext_tc_disable); #endif static u32 destroy_obj_hashfn(const struct tcf_proto *tp) { return jhash_3words(tp->chain->index, tp->prio, (__force __u32)tp->protocol, 0); } static void tcf_proto_signal_destroying(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_block *block = chain->block; mutex_lock(&block->proto_destroy_lock); hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node, destroy_obj_hashfn(tp)); mutex_unlock(&block->proto_destroy_lock); } static bool tcf_proto_cmp(const struct tcf_proto *tp1, const struct tcf_proto *tp2) { return tp1->chain->index == tp2->chain->index && tp1->prio == tp2->prio && tp1->protocol == tp2->protocol; } static bool tcf_proto_exists_destroying(struct tcf_chain *chain, struct tcf_proto *tp) { u32 hash = destroy_obj_hashfn(tp); struct tcf_proto *iter; bool found = false; rcu_read_lock(); hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter, destroy_ht_node, hash) { if (tcf_proto_cmp(tp, iter)) { found = true; break; } } rcu_read_unlock(); return found; } static void tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_block *block = chain->block; mutex_lock(&block->proto_destroy_lock); if (hash_hashed(&tp->destroy_ht_node)) hash_del_rcu(&tp->destroy_ht_node); mutex_unlock(&block->proto_destroy_lock); } /* Find classifier type by string name */ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) { const struct tcf_proto_ops *t, *res = NULL; if (kind) { read_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (strcmp(kind, t->kind) == 0) { if (try_module_get(t->owner)) res = t; break; } } read_unlock(&cls_mod_lock); } return res; } static const struct tcf_proto_ops * tcf_proto_lookup_ops(const char *kind, bool rtnl_held, struct netlink_ext_ack *extack) { const struct tcf_proto_ops *ops; ops = __tcf_proto_lookup_ops(kind); if (ops) return ops; #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); request_module(NET_CLS_ALIAS_PREFIX "%s", kind); if (rtnl_held) rtnl_lock(); ops = __tcf_proto_lookup_ops(kind); /* We dropped the RTNL semaphore in order to perform * the module load. So, even if we succeeded in loading * the module we have to replay the request. We indicate * this using -EAGAIN. */ if (ops) { module_put(ops->owner); return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "TC classifier not found"); return ERR_PTR(-ENOENT); } /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; int rc = -EEXIST; write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) if (!strcmp(ops->kind, t->kind)) goto out; list_add_tail(&ops->head, &tcf_proto_base); rc = 0; out: write_unlock(&cls_mod_lock); return rc; } EXPORT_SYMBOL(register_tcf_proto_ops); static struct workqueue_struct *tc_filter_wq; void unregister_tcf_proto_ops(struct tcf_proto_ops *ops) { struct tcf_proto_ops *t; int rc = -ENOENT; /* Wait for outstanding call_rcu()s, if any, from a * tcf_proto_ops's destroy() handler. */ rcu_barrier(); flush_workqueue(tc_filter_wq); write_lock(&cls_mod_lock); list_for_each_entry(t, &tcf_proto_base, head) { if (t == ops) { list_del(&t->head); rc = 0; break; } } write_unlock(&cls_mod_lock); WARN(rc, "unregister tc filter kind(%s) failed %d\n", ops->kind, rc); } EXPORT_SYMBOL(unregister_tcf_proto_ops); bool tcf_queue_work(struct rcu_work *rwork, work_func_t func) { INIT_RCU_WORK(rwork, func); return queue_rcu_work(tc_filter_wq, rwork); } EXPORT_SYMBOL(tcf_queue_work); /* Select new prio value from the range, managed by kernel. */ static inline u32 tcf_auto_prio(struct tcf_proto *tp) { u32 first = TC_H_MAKE(0xC0000000U, 0U); if (tp) first = tp->prio - 1; return TC_H_MAJ(first); } static bool tcf_proto_check_kind(struct nlattr *kind, char *name) { if (kind) return nla_strscpy(name, kind, IFNAMSIZ) < 0; memset(name, 0, IFNAMSIZ); return false; } static bool tcf_proto_is_unlocked(const char *kind) { const struct tcf_proto_ops *ops; bool ret; if (strlen(kind) == 0) return false; ops = tcf_proto_lookup_ops(kind, false, NULL); /* On error return false to take rtnl lock. Proto lookup/create * functions will perform lookup again and properly handle errors. */ if (IS_ERR(ops)) return false; ret = !!(ops->flags & TCF_PROTO_OPS_DOIT_UNLOCKED); module_put(ops->owner); return ret; } static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, u32 prio, struct tcf_chain *chain, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcf_proto *tp; int err; tp = kzalloc(sizeof(*tp), GFP_KERNEL); if (!tp) return ERR_PTR(-ENOBUFS); tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack); if (IS_ERR(tp->ops)) { err = PTR_ERR(tp->ops); goto errout; } tp->classify = tp->ops->classify; tp->protocol = protocol; tp->prio = prio; tp->chain = chain; tp->usesw = !tp->ops->reoffload; spin_lock_init(&tp->lock); refcount_set(&tp->refcnt, 1); err = tp->ops->init(tp); if (err) { module_put(tp->ops->owner); goto errout; } return tp; errout: kfree(tp); return ERR_PTR(err); } static void tcf_proto_get(struct tcf_proto *tp) { refcount_inc(&tp->refcnt); } static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add) { #ifdef CONFIG_NET_CLS_ACT struct tcf_block *block = tp->chain->block; bool counted = false; if (!add) { if (tp->usesw && tp->counted) { if (!atomic_dec_return(&block->useswcnt)) static_branch_dec(&tcf_sw_enabled_key); tp->counted = false; } return; } spin_lock(&tp->lock); if (tp->usesw && !tp->counted) { counted = true; tp->counted = true; } spin_unlock(&tp->lock); if (counted && atomic_inc_return(&block->useswcnt) == 1) static_branch_inc(&tcf_sw_enabled_key); #endif } static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); tcf_proto_count_usesw(tp, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); module_put(tp->ops->owner); kfree_rcu(tp, rcu); } static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { if (refcount_dec_and_test(&tp->refcnt)) tcf_proto_destroy(tp, rtnl_held, true, extack); } static bool tcf_proto_check_delete(struct tcf_proto *tp) { if (tp->ops->delete_empty) return tp->ops->delete_empty(tp); tp->deleting = true; return tp->deleting; } static void tcf_proto_mark_delete(struct tcf_proto *tp) { spin_lock(&tp->lock); tp->deleting = true; spin_unlock(&tp->lock); } static bool tcf_proto_is_deleting(struct tcf_proto *tp) { bool deleting; spin_lock(&tp->lock); deleting = tp->deleting; spin_unlock(&tp->lock); return deleting; } #define ASSERT_BLOCK_LOCKED(block) \ lockdep_assert_held(&(block)->lock) struct tcf_filter_chain_list_item { struct list_head list; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; }; static struct tcf_chain *tcf_chain_create(struct tcf_block *block, u32 chain_index) { struct tcf_chain *chain; ASSERT_BLOCK_LOCKED(block); chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; list_add_tail_rcu(&chain->list, &block->chain_list); mutex_init(&chain->filter_chain_lock); chain->block = block; chain->index = chain_index; chain->refcnt = 1; if (!chain->index) block->chain0.chain = chain; return chain; } static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, struct tcf_proto *tp_head) { if (item->chain_head_change) item->chain_head_change(tp_head, item->chain_head_change_priv); } static void tcf_chain0_head_change(struct tcf_chain *chain, struct tcf_proto *tp_head) { struct tcf_filter_chain_list_item *item; struct tcf_block *block = chain->block; if (chain->index) return; mutex_lock(&block->lock); list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); mutex_unlock(&block->lock); } /* Returns true if block can be safely freed. */ static bool tcf_chain_detach(struct tcf_chain *chain) { struct tcf_block *block = chain->block; ASSERT_BLOCK_LOCKED(block); list_del_rcu(&chain->list); if (!chain->index) block->chain0.chain = NULL; if (list_empty(&block->chain_list) && refcount_read(&block->refcnt) == 0) return true; return false; } static void tcf_block_destroy(struct tcf_block *block) { mutex_destroy(&block->lock); mutex_destroy(&block->proto_destroy_lock); xa_destroy(&block->ports); kfree_rcu(block, rcu); } static void tcf_chain_destroy(struct tcf_chain *chain, bool free_block) { struct tcf_block *block = chain->block; mutex_destroy(&chain->filter_chain_lock); kfree_rcu(chain, rcu); if (free_block) tcf_block_destroy(block); } static void tcf_chain_hold(struct tcf_chain *chain) { ASSERT_BLOCK_LOCKED(chain->block); ++chain->refcnt; } static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { ASSERT_BLOCK_LOCKED(chain->block); /* In case all the references are action references, this * chain should not be shown to the user. */ return chain->refcnt == chain->action_refcnt; } static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { struct tcf_chain *chain; ASSERT_BLOCK_LOCKED(block); list_for_each_entry(chain, &block->chain_list, list) { if (chain->index == chain_index) return chain; } return NULL; } #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, u32 chain_index) { struct tcf_chain *chain; list_for_each_entry_rcu(chain, &block->chain_list, list) { if (chain->index == chain_index) return chain; } return NULL; } #endif static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast, struct netlink_ext_ack *extack); static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, bool by_act) { struct tcf_chain *chain = NULL; bool is_first_reference; mutex_lock(&block->lock); chain = tcf_chain_lookup(block, chain_index); if (chain) { tcf_chain_hold(chain); } else { if (!create) goto errout; chain = tcf_chain_create(block, chain_index); if (!chain) goto errout; } if (by_act) ++chain->action_refcnt; is_first_reference = chain->refcnt - chain->action_refcnt == 1; mutex_unlock(&block->lock); /* Send notification only in case we got the first * non-action reference. Until then, the chain acts only as * a placeholder for actions pointing to it and user ought * not know about them. */ if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, RTM_NEWCHAIN, false, NULL); return chain; errout: mutex_unlock(&block->lock); return chain; } static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create) { return __tcf_chain_get(block, chain_index, create, false); } struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { return __tcf_chain_get(block, chain_index, true, true); } EXPORT_SYMBOL(tcf_chain_get_by_act); static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv); static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct tcf_block *block, struct sk_buff *oskb, u32 seq, u16 flags); static void __tcf_chain_put(struct tcf_chain *chain, bool by_act, bool explicitly_created) { struct tcf_block *block = chain->block; const struct tcf_proto_ops *tmplt_ops; unsigned int refcnt, non_act_refcnt; bool free_block = false; void *tmplt_priv; mutex_lock(&block->lock); if (explicitly_created) { if (!chain->explicitly_created) { mutex_unlock(&block->lock); return; } chain->explicitly_created = false; } if (by_act) chain->action_refcnt--; /* tc_chain_notify_delete can't be called while holding block lock. * However, when block is unlocked chain can be changed concurrently, so * save these to temporary variables. */ refcnt = --chain->refcnt; non_act_refcnt = refcnt - chain->action_refcnt; tmplt_ops = chain->tmplt_ops; tmplt_priv = chain->tmplt_priv; if (non_act_refcnt == chain->explicitly_created && !by_act) { if (non_act_refcnt == 0) tc_chain_notify_delete(tmplt_ops, tmplt_priv, chain->index, block, NULL, 0, 0); /* Last reference to chain, no need to lock. */ chain->flushing = false; } if (refcnt == 0) free_block = tcf_chain_detach(chain); mutex_unlock(&block->lock); if (refcnt == 0) { tc_chain_tmplt_del(tmplt_ops, tmplt_priv); tcf_chain_destroy(chain, free_block); } } static void tcf_chain_put(struct tcf_chain *chain) { __tcf_chain_put(chain, false, false); } void tcf_chain_put_by_act(struct tcf_chain *chain) { __tcf_chain_put(chain, true, false); } EXPORT_SYMBOL(tcf_chain_put_by_act); static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { __tcf_chain_put(chain, false, true); } static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) { struct tcf_proto *tp, *tp_next; mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_dereference(chain->filter_chain, chain); while (tp) { tp_next = rcu_dereference_protected(tp->next, 1); tcf_proto_signal_destroying(chain, tp); tp = tp_next; } tp = tcf_chain_dereference(chain->filter_chain, chain); RCU_INIT_POINTER(chain->filter_chain, NULL); tcf_chain0_head_change(chain, NULL); chain->flushing = true; mutex_unlock(&chain->filter_chain_lock); while (tp) { tp_next = rcu_dereference_protected(tp->next, 1); tcf_proto_put(tp, rtnl_held, NULL); tp = tp_next; } } static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); static void tcf_block_offload_init(struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, enum flow_block_command command, enum flow_block_binder_type binder_type, struct flow_block *flow_block, bool shared, struct netlink_ext_ack *extack) { bo->net = dev_net(dev); bo->command = command; bo->binder_type = binder_type; bo->block = flow_block; bo->block_shared = shared; bo->extack = extack; bo->sch = sch; bo->cb_list_head = &flow_block->cb_list; INIT_LIST_HEAD(&bo->cb_list); } static void tcf_block_unbind(struct tcf_block *block, struct flow_block_offload *bo); static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { struct tcf_block *block = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct Qdisc *sch = block_cb->indr.sch; struct netlink_ext_ack extack = {}; struct flow_block_offload bo = {}; tcf_block_offload_init(&bo, dev, sch, FLOW_BLOCK_UNBIND, block_cb->indr.binder_type, &block->flow_block, tcf_block_shared(block), &extack); rtnl_lock(); down_write(&block->cb_lock); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); tcf_block_unbind(block, &bo); up_write(&block->cb_lock); rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) { return atomic_read(&block->offloadcnt); } static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct Qdisc *sch, struct tcf_block_ext_info *ei, enum flow_block_command command, struct netlink_ext_ack *extack) { struct flow_block_offload bo = {}; tcf_block_offload_init(&bo, dev, sch, command, ei->binder_type, &block->flow_block, tcf_block_shared(block), extack); if (dev->netdev_ops->ndo_setup_tc) { int err; err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) { if (err != -EOPNOTSUPP) NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; } return tcf_block_setup(block, &bo); } flow_indr_dev_setup_offload(dev, sch, TC_SETUP_BLOCK, block, &bo, tc_block_indr_cleanup); tcf_block_setup(block, &bo); return -EOPNOTSUPP; } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; down_write(&block->cb_lock); /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ if (dev->netdev_ops->ndo_setup_tc && !tc_can_offload(dev) && tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; } err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; if (err) goto err_unlock; up_write(&block->cb_lock); return 0; no_offload_dev_inc: if (tcf_block_offload_in_use(block)) goto err_unlock; err = 0; block->nooffloaddevcnt++; err_unlock: up_write(&block->cb_lock); return err; } static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct net_device *dev = q->dev_queue->dev; int err; down_write(&block->cb_lock); err = tcf_block_offload_cmd(block, dev, q, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; up_write(&block->cb_lock); return; no_offload_dev_dec: WARN_ON(block->nooffloaddevcnt-- == 0); up_write(&block->cb_lock); } static int tcf_chain0_head_change_cb_add(struct tcf_block *block, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { struct tcf_filter_chain_list_item *item; struct tcf_chain *chain0; item = kmalloc(sizeof(*item), GFP_KERNEL); if (!item) { NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed"); return -ENOMEM; } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; mutex_lock(&block->lock); chain0 = block->chain0.chain; if (chain0) tcf_chain_hold(chain0); else list_add(&item->list, &block->chain0.filter_chain_list); mutex_unlock(&block->lock); if (chain0) { struct tcf_proto *tp_head; mutex_lock(&chain0->filter_chain_lock); tp_head = tcf_chain_dereference(chain0->filter_chain, chain0); if (tp_head) tcf_chain_head_change_item(item, tp_head); mutex_lock(&block->lock); list_add(&item->list, &block->chain0.filter_chain_list); mutex_unlock(&block->lock); mutex_unlock(&chain0->filter_chain_lock); tcf_chain_put(chain0); } return 0; } static void tcf_chain0_head_change_cb_del(struct tcf_block *block, struct tcf_block_ext_info *ei) { struct tcf_filter_chain_list_item *item; mutex_lock(&block->lock); list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { if (block->chain0.chain) tcf_chain_head_change_item(item, NULL); list_del(&item->list); mutex_unlock(&block->lock); kfree(item); return; } } mutex_unlock(&block->lock); WARN_ON(1); } struct tcf_net { spinlock_t idr_lock; /* Protects idr */ struct idr idr; }; static unsigned int tcf_net_id; static int tcf_block_insert(struct tcf_block *block, struct net *net, struct netlink_ext_ack *extack) { struct tcf_net *tn = net_generic(net, tcf_net_id); int err; idr_preload(GFP_KERNEL); spin_lock(&tn->idr_lock); err = idr_alloc_u32(&tn->idr, block, &block->index, block->index, GFP_NOWAIT); spin_unlock(&tn->idr_lock); idr_preload_end(); return err; } static void tcf_block_remove(struct tcf_block *block, struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); spin_lock(&tn->idr_lock); idr_remove(&tn->idr, block->index); spin_unlock(&tn->idr_lock); } static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) { NL_SET_ERR_MSG(extack, "Memory allocation for block failed"); return ERR_PTR(-ENOMEM); } mutex_init(&block->lock); mutex_init(&block->proto_destroy_lock); init_rwsem(&block->cb_lock); flow_block_init(&block->flow_block); INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->owner_list); INIT_LIST_HEAD(&block->chain0.filter_chain_list); refcount_set(&block->refcnt, 1); block->net = net; block->index = block_index; xa_init(&block->ports); /* Don't store q pointer for blocks which are shared */ if (!tcf_block_shared(block)) block->q = q; return block; } struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) { struct tcf_net *tn = net_generic(net, tcf_net_id); return idr_find(&tn->idr, block_index); } EXPORT_SYMBOL(tcf_block_lookup); static struct tcf_block *tcf_block_refcnt_get(struct net *net, u32 block_index) { struct tcf_block *block; rcu_read_lock(); block = tcf_block_lookup(net, block_index); if (block && !refcount_inc_not_zero(&block->refcnt)) block = NULL; rcu_read_unlock(); return block; } static struct tcf_chain * __tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain) { mutex_lock(&block->lock); if (chain) chain = list_is_last(&chain->list, &block->chain_list) ? NULL : list_next_entry(chain, list); else chain = list_first_entry_or_null(&block->chain_list, struct tcf_chain, list); /* skip all action-only chains */ while (chain && tcf_chain_held_by_acts_only(chain)) chain = list_is_last(&chain->list, &block->chain_list) ? NULL : list_next_entry(chain, list); if (chain) tcf_chain_hold(chain); mutex_unlock(&block->lock); return chain; } /* Function to be used by all clients that want to iterate over all chains on * block. It properly obtains block->lock and takes reference to chain before * returning it. Users of this function must be tolerant to concurrent chain * insertion/deletion or ensure that no concurrent chain modification is * possible. Note that all netlink dump callbacks cannot guarantee to provide * consistent dump because rtnl lock is released each time skb is filled with * data and sent to user-space. */ struct tcf_chain * tcf_get_next_chain(struct tcf_block *block, struct tcf_chain *chain) { struct tcf_chain *chain_next = __tcf_get_next_chain(block, chain); if (chain) tcf_chain_put(chain); return chain_next; } EXPORT_SYMBOL(tcf_get_next_chain); static struct tcf_proto * __tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) { u32 prio = 0; ASSERT_RTNL(); mutex_lock(&chain->filter_chain_lock); if (!tp) { tp = tcf_chain_dereference(chain->filter_chain, chain); } else if (tcf_proto_is_deleting(tp)) { /* 'deleting' flag is set and chain->filter_chain_lock was * unlocked, which means next pointer could be invalid. Restart * search. */ prio = tp->prio + 1; tp = tcf_chain_dereference(chain->filter_chain, chain); for (; tp; tp = tcf_chain_dereference(tp->next, chain)) if (!tp->deleting && tp->prio >= prio) break; } else { tp = tcf_chain_dereference(tp->next, chain); } if (tp) tcf_proto_get(tp); mutex_unlock(&chain->filter_chain_lock); return tp; } /* Function to be used by all clients that want to iterate over all tp's on * chain. Users of this function must be tolerant to concurrent tp * insertion/deletion or ensure that no concurrent chain modification is * possible. Note that all netlink dump callbacks cannot guarantee to provide * consistent dump because rtnl lock is released each time skb is filled with * data and sent to user-space. */ struct tcf_proto * tcf_get_next_proto(struct tcf_chain *chain, struct tcf_proto *tp) { struct tcf_proto *tp_next = __tcf_get_next_proto(chain, tp); if (tp) tcf_proto_put(tp, true, NULL); return tp_next; } EXPORT_SYMBOL(tcf_get_next_proto); static void tcf_block_flush_all_chains(struct tcf_block *block, bool rtnl_held) { struct tcf_chain *chain; /* Last reference to block. At this point chains cannot be added or * removed concurrently. */ for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { tcf_chain_put_explicitly_created(chain); tcf_chain_flush(chain, rtnl_held); } } /* Lookup Qdisc and increments its reference counter. * Set parent, if necessary. */ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, u32 *parent, int ifindex, bool rtnl_held, struct netlink_ext_ack *extack) { const struct Qdisc_class_ops *cops; struct net_device *dev; int err = 0; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) return 0; rcu_read_lock(); /* Find link */ dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { rcu_read_unlock(); return -ENODEV; } /* Find qdisc */ if (!*parent) { *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); err = -EINVAL; goto errout_rcu; } } *q = qdisc_refcount_inc_nz(*q); if (!*q) { NL_SET_ERR_MSG(extack, "Parent Qdisc doesn't exists"); err = -EINVAL; goto errout_rcu; } /* Is it classful? */ cops = (*q)->ops->cl_ops; if (!cops) { NL_SET_ERR_MSG(extack, "Qdisc not classful"); err = -EINVAL; goto errout_qdisc; } if (!cops->tcf_block) { NL_SET_ERR_MSG(extack, "Class doesn't support blocks"); err = -EOPNOTSUPP; goto errout_qdisc; } errout_rcu: /* At this point we know that qdisc is not noop_qdisc, * which means that qdisc holds a reference to net_device * and we hold a reference to qdisc, so it is safe to release * rcu read lock. */ rcu_read_unlock(); return err; errout_qdisc: rcu_read_unlock(); if (rtnl_held) qdisc_put(*q); else qdisc_put_unlocked(*q); *q = NULL; return err; } static int __tcf_qdisc_cl_find(struct Qdisc *q, u32 parent, unsigned long *cl, int ifindex, struct netlink_ext_ack *extack) { if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) return 0; /* Do we search for filter, attached to class? */ if (TC_H_MIN(parent)) { const struct Qdisc_class_ops *cops = q->ops->cl_ops; *cl = cops->find(q, parent); if (*cl == 0) { NL_SET_ERR_MSG(extack, "Specified class doesn't exist"); return -ENOENT; } } return 0; } static struct tcf_block *__tcf_block_find(struct net *net, struct Qdisc *q, unsigned long cl, int ifindex, u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; if (ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, block_index); if (!block) { NL_SET_ERR_MSG(extack, "Block of given index was not found"); return ERR_PTR(-EINVAL); } } else { const struct Qdisc_class_ops *cops = q->ops->cl_ops; block = cops->tcf_block(q, cl, extack); if (!block) return ERR_PTR(-EINVAL); if (tcf_block_shared(block)) { NL_SET_ERR_MSG(extack, "This filter block is shared. Please use the block index to manipulate the filters"); return ERR_PTR(-EOPNOTSUPP); } /* Always take reference to block in order to support execution * of rules update path of cls API without rtnl lock. Caller * must release block when it is finished using it. 'if' block * of this conditional obtain reference to block by calling * tcf_block_refcnt_get(). */ refcount_inc(&block->refcnt); } return block; } static void __tcf_block_put(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei, bool rtnl_held) { if (refcount_dec_and_mutex_lock(&block->refcnt, &block->lock)) { /* Flushing/putting all chains will cause the block to be * deallocated when last chain is freed. However, if chain_list * is empty, block has to be manually deallocated. After block * reference counter reached 0, it is no longer possible to * increment it or add new chains to block. */ bool free_block = list_empty(&block->chain_list); mutex_unlock(&block->lock); if (tcf_block_shared(block)) tcf_block_remove(block, block->net); if (q) tcf_block_offload_unbind(block, q, ei); if (free_block) tcf_block_destroy(block); else tcf_block_flush_all_chains(block, rtnl_held); } else if (q) { tcf_block_offload_unbind(block, q, ei); } } static void tcf_block_refcnt_put(struct tcf_block *block, bool rtnl_held) { __tcf_block_put(block, NULL, NULL, rtnl_held); } /* Find tcf block. * Set q, parent, cl when appropriate. */ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, u32 *parent, unsigned long *cl, int ifindex, u32 block_index, struct netlink_ext_ack *extack) { struct tcf_block *block; int err = 0; ASSERT_RTNL(); err = __tcf_qdisc_find(net, q, parent, ifindex, true, extack); if (err) goto errout; err = __tcf_qdisc_cl_find(*q, *parent, cl, ifindex, extack); if (err) goto errout_qdisc; block = __tcf_block_find(net, *q, *cl, ifindex, block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout_qdisc; } return block; errout_qdisc: if (*q) qdisc_put(*q); errout: *q = NULL; return ERR_PTR(err); } static void tcf_block_release(struct Qdisc *q, struct tcf_block *block, bool rtnl_held) { if (!IS_ERR_OR_NULL(block)) tcf_block_refcnt_put(block, rtnl_held); if (q) { if (rtnl_held) qdisc_put(q); else qdisc_put_unlocked(q); } } struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; enum flow_block_binder_type binder_type; }; static void tcf_block_owner_netif_keep_dst(struct tcf_block *block, struct Qdisc *q, enum flow_block_binder_type binder_type) { if (block->keep_dst && binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS && binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) netif_keep_dst(qdisc_dev(q)); } void tcf_block_netif_keep_dst(struct tcf_block *block) { struct tcf_block_owner_item *item; block->keep_dst = true; list_for_each_entry(item, &block->owner_list, list) tcf_block_owner_netif_keep_dst(block, item->q, item->binder_type); } EXPORT_SYMBOL(tcf_block_netif_keep_dst); static int tcf_block_owner_add(struct tcf_block *block, struct Qdisc *q, enum flow_block_binder_type binder_type) { struct tcf_block_owner_item *item; item = kmalloc(sizeof(*item), GFP_KERNEL); if (!item) return -ENOMEM; item->q = q; item->binder_type = binder_type; list_add(&item->list, &block->owner_list); return 0; } static void tcf_block_owner_del(struct tcf_block *block, struct Qdisc *q, enum flow_block_binder_type binder_type) { struct tcf_block_owner_item *item; list_for_each_entry(item, &block->owner_list, list) { if (item->q == q && item->binder_type == binder_type) { list_del(&item->list); kfree(item); return; } } WARN_ON(1); } static bool tcf_block_tracks_dev(struct tcf_block *block, struct tcf_block_ext_info *ei) { return tcf_block_shared(block) && (ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS || ei->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS); } int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei, struct netlink_ext_ack *extack) { struct net_device *dev = qdisc_dev(q); struct net *net = qdisc_net(q); struct tcf_block *block = NULL; int err; if (ei->block_index) /* block_index not 0 means the shared block is requested */ block = tcf_block_refcnt_get(net, ei->block_index); if (!block) { block = tcf_block_create(net, q, ei->block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); if (tcf_block_shared(block)) { err = tcf_block_insert(block, net, extack); if (err) goto err_block_insert; } } err = tcf_block_owner_add(block, q, ei->binder_type); if (err) goto err_block_owner_add; tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); err = tcf_chain0_head_change_cb_add(block, ei, extack); if (err) goto err_chain0_head_change_cb_add; err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; if (tcf_block_tracks_dev(block, ei)) { err = xa_insert(&block->ports, dev->ifindex, dev, GFP_KERNEL); if (err) { NL_SET_ERR_MSG(extack, "block dev insert failed"); goto err_dev_insert; } } *p_block = block; return 0; err_dev_insert: tcf_block_offload_unbind(block, q, ei); err_block_offload_bind: tcf_chain0_head_change_cb_del(block, ei); err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: err_block_insert: tcf_block_refcnt_put(block, true); return err; } EXPORT_SYMBOL(tcf_block_get_ext); static void tcf_chain_head_change_dflt(struct tcf_proto *tp_head, void *priv) { struct tcf_proto __rcu **p_filter_chain = priv; rcu_assign_pointer(*p_filter_chain, tp_head); } int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack) { struct tcf_block_ext_info ei = { .chain_head_change = tcf_chain_head_change_dflt, .chain_head_change_priv = p_filter_chain, }; WARN_ON(!p_filter_chain); return tcf_block_get_ext(p_block, q, &ei, extack); } EXPORT_SYMBOL(tcf_block_get); /* XXX: Standalone actions are not allowed to jump to any chain, and bound * actions should be all removed after flushing. */ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { struct net_device *dev = qdisc_dev(q); if (!block) return; if (tcf_block_tracks_dev(block, ei)) xa_erase(&block->ports, dev->ifindex); tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); __tcf_block_put(block, q, ei, true); } EXPORT_SYMBOL(tcf_block_put_ext); void tcf_block_put(struct tcf_block *block) { struct tcf_block_ext_info ei = {0, }; if (!block) return; tcf_block_put_ext(block, block->q, &ei); } EXPORT_SYMBOL(tcf_block_put); static int tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, void *cb_priv, bool add, bool offload_in_use, struct netlink_ext_ack *extack) { struct tcf_chain *chain, *chain_prev; struct tcf_proto *tp, *tp_prev; int err; lockdep_assert_held(&block->cb_lock); for (chain = __tcf_get_next_chain(block, NULL); chain; chain_prev = chain, chain = __tcf_get_next_chain(block, chain), tcf_chain_put(chain_prev)) { if (chain->tmplt_ops && add) chain->tmplt_ops->tmplt_reoffload(chain, true, cb, cb_priv); for (tp = __tcf_get_next_proto(chain, NULL); tp; tp_prev = tp, tp = __tcf_get_next_proto(chain, tp), tcf_proto_put(tp_prev, true, NULL)) { if (tp->ops->reoffload) { err = tp->ops->reoffload(tp, add, cb, cb_priv, extack); if (err && add) goto err_playback_remove; } else if (add && offload_in_use) { err = -EOPNOTSUPP; NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); goto err_playback_remove; } } if (chain->tmplt_ops && !add) chain->tmplt_ops->tmplt_reoffload(chain, false, cb, cb_priv); } return 0; err_playback_remove: tcf_proto_put(tp, true, NULL); tcf_chain_put(chain); tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, extack); return err; } static int tcf_block_bind(struct tcf_block *block, struct flow_block_offload *bo) { struct flow_block_cb *block_cb, *next; int err, i = 0; lockdep_assert_held(&block->cb_lock); list_for_each_entry(block_cb, &bo->cb_list, list) { err = tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, true, tcf_block_offload_in_use(block), bo->extack); if (err) goto err_unroll; if (!bo->unlocked_driver_cb) block->lockeddevcnt++; i++; } list_splice(&bo->cb_list, &block->flow_block.cb_list); return 0; err_unroll: list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->driver_list); if (i-- > 0) { list_del(&block_cb->list); tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, false, tcf_block_offload_in_use(block), NULL); if (!bo->unlocked_driver_cb) block->lockeddevcnt--; } flow_block_cb_free(block_cb); } return err; } static void tcf_block_unbind(struct tcf_block *block, struct flow_block_offload *bo) { struct flow_block_cb *block_cb, *next; lockdep_assert_held(&block->cb_lock); list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, false, tcf_block_offload_in_use(block), NULL); list_del(&block_cb->list); flow_block_cb_free(block_cb); if (!bo->unlocked_driver_cb) block->lockeddevcnt--; } } static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo) { int err; switch (bo->command) { case FLOW_BLOCK_BIND: err = tcf_block_bind(block, bo); break; case FLOW_BLOCK_UNBIND: err = 0; tcf_block_unbind(block, bo); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } /* Main classifier routine: scans classifier chain attached * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. */ static inline int __tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, const struct tcf_proto *orig_tp, struct tcf_result *res, bool compat_mode, struct tcf_exts_miss_cookie_node *n, int act_index, u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT const int max_reclassify_loop = 16; const struct tcf_proto *first_tp; int limit = 0; reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { __be16 protocol = skb_protocol(skb, false); int err = 0; if (n) { struct tcf_exts *exts; if (n->tp_prio != tp->prio) continue; /* We re-lookup the tp and chain based on index instead * of having hard refs and locks to them, so do a sanity * check if any of tp,chain,exts was replaced by the * time we got here with a cookie from hardware. */ if (unlikely(n->tp != tp || n->tp->chain != n->chain || !tp->ops->get_exts)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } exts = tp->ops->get_exts(tp, n->handle); if (unlikely(!exts || n->exts != exts)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } n = NULL; err = tcf_exts_exec_ex(skb, exts, act_index, res); } else { if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; err = tc_classify(skb, tp, res); } #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { first_tp = orig_tp; *last_executed_chain = first_tp->chain->index; goto reset; } else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) { first_tp = res->goto_tp; *last_executed_chain = err & TC_ACT_EXT_VAL_MASK; goto reset; } #endif if (err >= 0) return err; } if (unlikely(n)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= max_reclassify_loop)) { net_notice_ratelimited("%u: reclassify loop, rule prio %u, protocol %02x\n", tp->chain->block->index, tp->prio & 0xffff, ntohs(tp->protocol)); tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); return TC_ACT_SHOT; } tp = first_tp; goto reclassify; #endif } int tcf_classify(struct sk_buff *skb, const struct tcf_block *block, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) u32 last_executed_chain = 0; return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0, &last_executed_chain); #else u32 last_executed_chain = tp ? tp->chain->index : 0; struct tcf_exts_miss_cookie_node *n = NULL; const struct tcf_proto *orig_tp = tp; struct tc_skb_ext *ext; int act_index = 0; int ret; if (block) { ext = skb_ext_find(skb, TC_SKB_EXT); if (ext && (ext->chain || ext->act_miss)) { struct tcf_chain *fchain; u32 chain; if (ext->act_miss) { n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie, &act_index); if (!n) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_COOKIE_ERROR); return TC_ACT_SHOT; } chain = n->chain_index; } else { chain = ext->chain; } fchain = tcf_chain_lookup_rcu(block, chain); if (!fchain) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_CHAIN_NOTFOUND); return TC_ACT_SHOT; } /* Consume, so cloned/redirect skbs won't inherit ext */ skb_ext_del(skb, TC_SKB_EXT); tp = rcu_dereference_bh(fchain->filter_chain); last_executed_chain = fchain->index; } } ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, n, act_index, &last_executed_chain); if (tc_skb_ext_tc_enabled()) { /* If we missed on some chain */ if (ret == TC_ACT_UNSPEC && last_executed_chain) { struct tc_skb_cb *cb = tc_skb_cb(skb); ext = tc_skb_ext_alloc(skb); if (WARN_ON_ONCE(!ext)) { tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM); return TC_ACT_SHOT; } ext->chain = last_executed_chain; ext->mru = cb->mru; ext->post_ct = cb->post_ct; ext->post_ct_snat = cb->post_ct_snat; ext->post_ct_dnat = cb->post_ct_dnat; ext->zone = cb->zone; } } return ret; #endif } EXPORT_SYMBOL(tcf_classify); struct tcf_chain_info { struct tcf_proto __rcu **pprev; struct tcf_proto __rcu *next; }; static struct tcf_proto *tcf_chain_tp_prev(struct tcf_chain *chain, struct tcf_chain_info *chain_info) { return tcf_chain_dereference(*chain_info->pprev, chain); } static int tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { if (chain->flushing) return -EAGAIN; RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain, chain_info)); if (*chain_info->pprev == chain->filter_chain) tcf_chain0_head_change(chain, tp); tcf_proto_get(tp); rcu_assign_pointer(*chain_info->pprev, tp); return 0; } static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_chain_info *chain_info, struct tcf_proto *tp) { struct tcf_proto *next = tcf_chain_dereference(chain_info->next, chain); tcf_proto_mark_delete(tp); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, bool prio_allocate, struct netlink_ext_ack *extack); /* Try to insert new proto. * If proto with specified priority already exists, free new proto * and return existing one. */ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, struct tcf_proto *tp_new, u32 protocol, u32 prio, bool rtnl_held) { struct tcf_chain_info chain_info; struct tcf_proto *tp; int err = 0; mutex_lock(&chain->filter_chain_lock); if (tcf_proto_exists_destroying(chain, tp_new)) { mutex_unlock(&chain->filter_chain_lock); tcf_proto_destroy(tp_new, rtnl_held, false, NULL); return ERR_PTR(-EAGAIN); } tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, NULL); if (!tp) err = tcf_chain_tp_insert(chain, &chain_info, tp_new); mutex_unlock(&chain->filter_chain_lock); if (tp) { tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = tp; } else if (err) { tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = ERR_PTR(err); } return tp_new; } static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcf_chain_info chain_info; struct tcf_proto *tp_iter; struct tcf_proto **pprev; struct tcf_proto *next; mutex_lock(&chain->filter_chain_lock); /* Atomically find and remove tp from chain. */ for (pprev = &chain->filter_chain; (tp_iter = tcf_chain_dereference(*pprev, chain)); pprev = &tp_iter->next) { if (tp_iter == tp) { chain_info.pprev = pprev; chain_info.next = tp_iter->next; WARN_ON(tp_iter->deleting); break; } } /* Verify that tp still exists and no new filters were inserted * concurrently. * Mark tp for deletion if it is empty. */ if (!tp_iter || !tcf_proto_check_delete(tp)) { mutex_unlock(&chain->filter_chain_lock); return; } tcf_proto_signal_destroying(chain, tp); next = tcf_chain_dereference(chain_info.next, chain); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info.pprev, next); mutex_unlock(&chain->filter_chain_lock); tcf_proto_put(tp, rtnl_held, extack); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, bool prio_allocate, struct netlink_ext_ack *extack) { struct tcf_proto **pprev; struct tcf_proto *tp; /* Check the chain for existence of proto-tcf with this priority */ for (pprev = &chain->filter_chain; (tp = tcf_chain_dereference(*pprev, chain)); pprev = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { if (prio_allocate) { NL_SET_ERR_MSG(extack, "Lowest ID from auto-alloc range already in use"); return ERR_PTR(-ENOSPC); } if (tp->protocol != protocol && protocol) { NL_SET_ERR_MSG(extack, "Protocol mismatch for filter with specified priority"); return ERR_PTR(-EINVAL); } } else { tp = NULL; } break; } } chain_info->pprev = pprev; if (tp) { chain_info->next = tp->next; tcf_proto_get(tp); } else { chain_info->next = NULL; } return tp; } static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, bool terse_dump, bool rtnl_held, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); int ret = -EMSGSIZE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; if (q) { tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = parent; } else { tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; tcm->tcm_block_index = block->index; } tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; if (nla_put_u32(skb, TCA_CHAIN, tp->chain->index)) goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; } else if (terse_dump) { if (tp->ops->terse_dump) { if (tp->ops->terse_dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } else { goto cls_op_not_supp; } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto nla_put_failure; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; cls_op_not_supp: ret = -EOPNOTSUPP; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return ret; } static struct sk_buff *tfilter_notify_prep(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, u32 portid, bool rtnl_held, struct netlink_ext_ack *extack) { unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE; struct sk_buff *skb; int ret; retry: skb = alloc_skb(size, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, false, rtnl_held, extack); if (ret <= 0) { kfree_skb(skb); if (ret == -EMSGSIZE) { size += NLMSG_GOODSIZE; goto retry; } return ERR_PTR(-EINVAL); } return skb; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, bool unicast, bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err = 0; if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event, portid, rtnl_held, extack); if (IS_ERR(skb)) return PTR_ERR(skb); if (unicast) err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); return err; } static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; int err; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return tp->ops->delete(tp, fh, last, rtnl_held, extack); skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, RTM_DELTFILTER, portid, rtnl_held, extack); if (IS_ERR(skb)) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); return PTR_ERR(skb); } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); if (err) { kfree_skb(skb); return err; } err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); return err; } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event, struct netlink_ext_ack *extack) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) tfilter_notify(net, oskb, n, tp, block, q, parent, NULL, event, false, true, extack); } static void tfilter_put(struct tcf_proto *tp, void *fh) { if (tp->ops->put && fh) tp->ops->put(tp, fh); } static bool is_qdisc_ingress(__u32 classid) { return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); } static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; bool prio_allocate; u32 parent; u32 chain_index; struct Qdisc *q; struct tcf_chain_info chain_info; struct tcf_chain *chain; struct tcf_block *block; struct tcf_proto *tp; unsigned long cl; void *fh; int err; int tp_created; bool rtnl_held = false; u32 flags; replay: tp_created = 0; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); prio_allocate = false; parent = t->tcm_parent; tp = NULL; cl = 0; block = NULL; q = NULL; chain = NULL; flags = 0; if (prio == 0) { /* If no priority is provided by the user, * we allocate one. */ if (n->nlmsg_flags & NLM_F_CREATE) { prio = TC_H_MAKE(0x80000000U, 0U); prio_allocate = true; } else { NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } } /* Find head of filter chain. */ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); if (err) return err; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); err = -EINVAL; goto errout; } /* Take rtnl mutex if rtnl_held was set to true on previous iteration, * block is shared (no qdisc found), qdisc is not unlocked, classifier * type is not specified, classifier is not unlocked. */ if (rtnl_held || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); if (err) goto errout; block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; } block->classid = parent; chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, true); if (!chain) { NL_SET_ERR_MSG(extack, "Cannot create specified filter chain"); err = -ENOMEM; goto errout; } mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, prio_allocate, extack); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout_locked; } if (tp == NULL) { struct tcf_proto *tp_new = NULL; if (chain->flushing) { err = -EAGAIN; goto errout_locked; } /* Proto-tcf does not exist, create new one */ if (tca[TCA_KIND] == NULL || !protocol) { NL_SET_ERR_MSG(extack, "Filter kind and protocol must be specified"); err = -EINVAL; goto errout_locked; } if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout_locked; } if (prio_allocate) prio = tcf_auto_prio(tcf_chain_tp_prev(chain, &chain_info)); mutex_unlock(&chain->filter_chain_lock); tp_new = tcf_proto_create(name, protocol, prio, chain, rtnl_held, extack); if (IS_ERR(tp_new)) { err = PTR_ERR(tp_new); goto errout_tp; } tp_created = 1; tp = tcf_chain_tp_insert_unique(chain, tp_new, protocol, prio, rtnl_held); if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout_tp; } } else { mutex_unlock(&chain->filter_chain_lock); } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; } fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWTFILTER and NLM_F_CREATE to create a new filter"); err = -ENOENT; goto errout; } } else if (n->nlmsg_flags & NLM_F_EXCL) { tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Filter already exists"); err = -EEXIST; goto errout; } if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); err = -EINVAL; goto errout; } if (!(n->nlmsg_flags & NLM_F_CREATE)) flags |= TCA_ACT_FLAGS_REPLACE; if (!rtnl_held) flags |= TCA_ACT_FLAGS_NO_RTNL; if (is_qdisc_ingress(parent)) flags |= TCA_ACT_FLAGS_AT_INGRESS; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); tcf_proto_count_usesw(tp, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; } errout: if (err && tp_created) tcf_chain_tp_delete_empty(chain, tp, rtnl_held, NULL); errout_tp: if (chain) { if (tp && !IS_ERR(tp)) tcf_proto_put(tp, rtnl_held, NULL); if (!tp_created) tcf_chain_put(chain); } tcf_block_release(q, block, rtnl_held); if (rtnl_held) rtnl_unlock(); if (err == -EAGAIN) { /* Take rtnl lock in case EAGAIN is caused by concurrent flush * of target chain. */ rtnl_held = true; /* Replay the request. */ goto replay; } return err; errout_locked: mutex_unlock(&chain->filter_chain_lock); goto errout; } static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; u32 parent; u32 chain_index; struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; struct tcf_block *block = NULL; struct tcf_proto *tp = NULL; unsigned long cl = 0; void *fh = NULL; int err; bool rtnl_held = false; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); parent = t->tcm_parent; if (prio == 0 && (protocol || t->tcm_handle || tca[TCA_KIND])) { NL_SET_ERR_MSG(extack, "Cannot flush filters with protocol, handle or kind set"); return -ENOENT; } /* Find head of filter chain. */ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); if (err) return err; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); err = -EINVAL; goto errout; } /* Take rtnl mutex if flushing whole chain, block is shared (no qdisc * found), qdisc is not unlocked, classifier type is not specified, * classifier is not unlocked. */ if (!prio || (q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); if (err) goto errout; block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; } chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, false); if (!chain) { /* User requested flush on non-existent chain. Nothing to do, * so just return success. */ if (prio == 0) { err = 0; goto errout; } NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -ENOENT; goto errout; } if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, chain, RTM_DELTFILTER, extack); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; } mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, extack); if (!tp) { err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); goto errout_locked; } else if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout_locked; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout_locked; } else if (t->tcm_handle == 0) { tcf_proto_signal_destroying(chain, tp); tcf_chain_tp_remove(chain, &chain_info, tp); mutex_unlock(&chain->filter_chain_lock); tcf_proto_put(tp, rtnl_held, NULL); tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_DELTFILTER, false, rtnl_held, extack); err = 0; goto errout; } mutex_unlock(&chain->filter_chain_lock); fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { NL_SET_ERR_MSG(extack, "Specified filter handle not found"); err = -ENOENT; } else { bool last; err = tfilter_del_notify(net, skb, n, tp, block, q, parent, fh, &last, rtnl_held, extack); if (err) goto errout; if (last) tcf_chain_tp_delete_empty(chain, tp, rtnl_held, extack); } errout: if (chain) { if (tp && !IS_ERR(tp)) tcf_proto_put(tp, rtnl_held, NULL); tcf_chain_put(chain); } tcf_block_release(q, block, rtnl_held); if (rtnl_held) rtnl_unlock(); return err; errout_locked: mutex_unlock(&chain->filter_chain_lock); goto errout; } static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; char name[IFNAMSIZ]; struct tcmsg *t; u32 protocol; u32 prio; u32 parent; u32 chain_index; struct Qdisc *q = NULL; struct tcf_chain_info chain_info; struct tcf_chain *chain = NULL; struct tcf_block *block = NULL; struct tcf_proto *tp = NULL; unsigned long cl = 0; void *fh = NULL; int err; bool rtnl_held = false; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); protocol = TC_H_MIN(t->tcm_info); prio = TC_H_MAJ(t->tcm_info); parent = t->tcm_parent; if (prio == 0) { NL_SET_ERR_MSG(extack, "Invalid filter command with priority of zero"); return -ENOENT; } /* Find head of filter chain. */ err = __tcf_qdisc_find(net, &q, &parent, t->tcm_ifindex, false, extack); if (err) return err; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC filter name too long"); err = -EINVAL; goto errout; } /* Take rtnl mutex if block is shared (no qdisc found), qdisc is not * unlocked, classifier type is not specified, classifier is not * unlocked. */ if ((q && !(q->ops->cl_ops->flags & QDISC_CLASS_OPS_DOIT_UNLOCKED)) || !tcf_proto_is_unlocked(name)) { rtnl_held = true; rtnl_lock(); } err = __tcf_qdisc_cl_find(q, parent, &cl, t->tcm_ifindex, extack); if (err) goto errout; block = __tcf_block_find(net, q, cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) { err = PTR_ERR(block); goto errout; } chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout; } chain = tcf_chain_get(block, chain_index, false); if (!chain) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; goto errout; } mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, extack); mutex_unlock(&chain->filter_chain_lock); if (!tp) { err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); goto errout; } else if (IS_ERR(tp)) { err = PTR_ERR(tp); goto errout; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); err = -EINVAL; goto errout; } fh = tp->ops->get(tp, t->tcm_handle); if (!fh) { NL_SET_ERR_MSG(extack, "Specified filter handle not found"); err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, true, rtnl_held, NULL); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } tfilter_put(tp, fh); errout: if (chain) { if (tp && !IS_ERR(tp)) tcf_proto_put(tp, rtnl_held, NULL); tcf_chain_put(chain); } tcf_block_release(q, block, rtnl_held); if (rtnl_held) rtnl_unlock(); return err; } struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; struct tcf_block *block; struct Qdisc *q; u32 parent; bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) { struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER, a->terse_dump, true, NULL); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; struct tcmsg *tcm = nlmsg_data(cb->nlh); struct tcf_proto *tp, *tp_prev; struct tcf_dump_args arg; for (tp = __tcf_get_next_proto(chain, NULL); tp; tp_prev = tp, tp = __tcf_get_next_proto(chain, tp), tcf_proto_put(tp_prev, true, NULL), (*p_index)++) { if (*p_index < index_start) continue; if (TC_H_MAJ(tcm->tcm_info) && TC_H_MAJ(tcm->tcm_info) != tp->prio) continue; if (TC_H_MIN(tcm->tcm_info) && TC_H_MIN(tcm->tcm_info) != tp->protocol) continue; if (*p_index > index_start) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER, false, true, NULL) <= 0) goto errout; cb->args[1] = 1; } if (!tp->ops->walk) continue; arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; arg.block = block; arg.q = q; arg.parent = parent; arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) goto errout; } return true; errout: tcf_proto_put(tp, true, NULL); return false; } static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { [TCA_CHAIN] = { .type = NLA_U32 }, [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), }; /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { struct tcf_chain *chain, *chain_prev; struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); bool terse_dump = false; long index_start; long index; u32 parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, tcf_tfilter_dump_policy, cb->extack); if (err) return err; if (tca[TCA_DUMP_FLAGS]) { struct nla_bitfield32 flags = nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; } if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; /* If we work with block index, q is NULL and parent value * will never be used in the following code. The check * in tcf_fill_node prevents it. However, compiler does not * see that far, so set parent to zero to silence the warning * about parent being uninitialized. */ parent = 0; } else { const struct Qdisc_class_ops *cops; struct net_device *dev; unsigned long cl = 0; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; parent = tcm->tcm_parent; if (!parent) q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; cops = q->ops->cl_ops; if (!cops) goto out; if (!cops->tcf_block) goto out; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->find(q, tcm->tcm_parent); if (cl == 0) goto out; } block = cops->tcf_block(q, cl, NULL); if (!block) goto out; parent = block->classid; if (tcf_block_shared(block)) q = NULL; } index_start = cb->args[0]; index = 0; for (chain = __tcf_get_next_chain(block, NULL); chain; chain_prev = chain, chain = __tcf_get_next_chain(block, chain), tcf_chain_put(chain_prev)) { if (tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; } } if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) tcf_block_refcnt_put(block, true); cb->args[0] = index; out: /* If we did no progress, the error (EMSGSIZE) is real */ if (skb->len == 0 && err) return err; return skb->len; } static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct net *net, struct sk_buff *skb, struct tcf_block *block, u32 portid, u32 seq, u16 flags, int event, struct netlink_ext_ack *extack) { unsigned char *b = skb_tail_pointer(skb); const struct tcf_proto_ops *ops; struct nlmsghdr *nlh; struct tcmsg *tcm; void *priv; ops = tmplt_ops; priv = tmplt_priv; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_handle = 0; if (block->q) { tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; tcm->tcm_parent = block->q->handle; } else { tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; tcm->tcm_block_index = block->index; } if (nla_put_u32(skb, TCA_CHAIN, chain_index)) goto nla_put_failure; if (ops) { if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; if (ops->tmplt_dump(skb, net, priv) < 0) goto nla_put_failure; } if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -EMSGSIZE; } static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast, struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct tcf_block *block = chain->block; struct net *net = block->net; struct sk_buff *skb; int err = 0; if (!unicast && !rtnl_notify_needed(net, flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, portid, seq, flags, event, extack) <= 0) { kfree_skb(skb); return -EINVAL; } if (unicast) err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); return err; } static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct tcf_block *block, struct sk_buff *oskb, u32 seq, u16 flags) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct net *net = block->net; struct sk_buff *skb; if (!rtnl_notify_needed(net, flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, struct nlattr **tca, struct netlink_ext_ack *extack) { const struct tcf_proto_ops *ops; char name[IFNAMSIZ]; void *tmplt_priv; /* If kind is not set, user did not specify template. */ if (!tca[TCA_KIND]) return 0; if (tcf_proto_check_kind(tca[TCA_KIND], name)) { NL_SET_ERR_MSG(extack, "Specified TC chain template name too long"); return -EINVAL; } ops = tcf_proto_lookup_ops(name, true, extack); if (IS_ERR(ops)) return PTR_ERR(ops); if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump || !ops->tmplt_reoffload) { NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); module_put(ops->owner); return -EOPNOTSUPP; } tmplt_priv = ops->tmplt_create(net, chain, tca, extack); if (IS_ERR(tmplt_priv)) { module_put(ops->owner); return PTR_ERR(tmplt_priv); } chain->tmplt_ops = ops; chain->tmplt_priv = tmplt_priv; return 0; } static void tc_chain_tmplt_del(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv) { /* If template ops are set, no work to do for us. */ if (!tmplt_ops) return; tmplt_ops->tmplt_destroy(tmplt_priv); module_put(tmplt_ops->owner); } /* Add/delete/get a chain */ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct tcmsg *t; u32 parent; u32 chain_index; struct Qdisc *q; struct tcf_chain *chain; struct tcf_block *block; unsigned long cl; int err; replay: q = NULL; err = nlmsg_parse_deprecated(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; t = nlmsg_data(n); parent = t->tcm_parent; cl = 0; block = tcf_block_find(net, &q, &parent, &cl, t->tcm_ifindex, t->tcm_block_index, extack); if (IS_ERR(block)) return PTR_ERR(block); chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; goto errout_block; } mutex_lock(&block->lock); chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { if (tcf_chain_held_by_acts_only(chain)) { /* The chain exists only because there is * some action referencing it. */ tcf_chain_hold(chain); } else { NL_SET_ERR_MSG(extack, "Filter chain already exists"); err = -EEXIST; goto errout_block_locked; } } else { if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); err = -ENOENT; goto errout_block_locked; } chain = tcf_chain_create(block, chain_index); if (!chain) { NL_SET_ERR_MSG(extack, "Failed to create filter chain"); err = -ENOMEM; goto errout_block_locked; } } } else { if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; goto errout_block_locked; } tcf_chain_hold(chain); } if (n->nlmsg_type == RTM_NEWCHAIN) { /* Modifying chain requires holding parent block lock. In case * the chain was successfully added, take a reference to the * chain. This ensures that an empty chain does not disappear at * the end of this function. */ tcf_chain_hold(chain); chain->explicitly_created = true; } mutex_unlock(&block->lock); switch (n->nlmsg_type) { case RTM_NEWCHAIN: err = tc_chain_tmplt_add(chain, net, tca, extack); if (err) { tcf_chain_put_explicitly_created(chain); goto errout; } tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, RTM_NEWCHAIN, false, extack); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, chain, RTM_DELTFILTER, extack); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, n->nlmsg_flags, n->nlmsg_type, true, extack); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; default: err = -EOPNOTSUPP; NL_SET_ERR_MSG(extack, "Unsupported message type"); goto errout; } errout: tcf_chain_put(chain); errout_block: tcf_block_release(q, block, true); if (err == -EAGAIN) /* Replay the request. */ goto replay; return err; errout_block_locked: mutex_unlock(&block->lock); goto errout_block; } /* called with RTNL */ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); struct tcf_chain *chain; long index_start; long index; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, cb->extack); if (err) return err; if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) goto out; } else { const struct Qdisc_class_ops *cops; struct net_device *dev; unsigned long cl = 0; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return skb->len; if (!tcm->tcm_parent) q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; cops = q->ops->cl_ops; if (!cops) goto out; if (!cops->tcf_block) goto out; if (TC_H_MIN(tcm->tcm_parent)) { cl = cops->find(q, tcm->tcm_parent); if (cl == 0) goto out; } block = cops->tcf_block(q, cl, NULL); if (!block) goto out; if (tcf_block_shared(block)) q = NULL; } index_start = cb->args[0]; index = 0; mutex_lock(&block->lock); list_for_each_entry(chain, &block->chain_list, list) { if ((tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index)) continue; if (index < index_start) { index++; continue; } if (tcf_chain_held_by_acts_only(chain)) continue; err = tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWCHAIN, NULL); if (err <= 0) break; index++; } mutex_unlock(&block->lock); if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) tcf_block_refcnt_put(block, true); cb->args[0] = index; out: /* If we did no progress, the error (EMSGSIZE) is real */ if (skb->len == 0 && err) return err; return skb->len; } int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, int police, struct tcf_proto *tp, u32 handle, bool use_action_miss) { int err = 0; #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; exts->miss_cookie_node = NULL; /* Note: we do not own yet a reference on net. * This reference might be taken later from tcf_exts_get_net(). */ exts->net = net; exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), GFP_KERNEL); if (!exts->actions) return -ENOMEM; #endif exts->action = action; exts->police = police; if (!use_action_miss) return 0; err = tcf_exts_miss_cookie_base_alloc(exts, tp, handle); if (err) goto err_miss_alloc; return 0; err_miss_alloc: tcf_exts_destroy(exts); #ifdef CONFIG_NET_CLS_ACT exts->actions = NULL; #endif return err; } EXPORT_SYMBOL(tcf_exts_init_ex); void tcf_exts_destroy(struct tcf_exts *exts) { tcf_exts_miss_cookie_base_destroy(exts); #ifdef CONFIG_NET_CLS_ACT if (exts->actions) { tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); } exts->nr_actions = 0; #endif } EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate_ex(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { int init_res[TCA_ACT_MAX_PRIO] = {}; struct tc_action *act; size_t attr_size = 0; if (exts->police && tb[exts->police]) { struct tc_action_ops *a_o; flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; a_o = tc_action_load_ops(tb[exts->police], flags, extack); if (IS_ERR(a_o)) return PTR_ERR(a_o); act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, a_o, init_res, flags, extack); module_put(a_o->owner); if (IS_ERR(act)) return PTR_ERR(act); act->type = exts->type = TCA_OLD_COMPAT; exts->actions[0] = act; exts->nr_actions = 1; tcf_idr_insert_many(exts->actions, init_res); } else if (exts->action && tb[exts->action]) { int err; flags |= TCA_ACT_FLAGS_BIND; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, exts->actions, init_res, &attr_size, flags, fl_flags, extack); if (err < 0) return err; exts->nr_actions = err; } } #else if ((exts->action && tb[exts->action]) || (exts->police && tb[exts->police])) { NL_SET_ERR_MSG(extack, "Classifier actions are not supported per compile options (CONFIG_NET_CLS_ACT)"); return -EOPNOTSUPP; } #endif return 0; } EXPORT_SYMBOL(tcf_exts_validate_ex); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, u32 flags, struct netlink_ext_ack *extack) { return tcf_exts_validate_ex(net, tp, tb, rate_tlv, exts, flags, 0, extack); } EXPORT_SYMBOL(tcf_exts_validate); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT struct tcf_exts old = *dst; *dst = *src; tcf_exts_destroy(&old); #endif } EXPORT_SYMBOL(tcf_exts_change); #ifdef CONFIG_NET_CLS_ACT static struct tc_action *tcf_exts_first_act(struct tcf_exts *exts) { if (exts->nr_actions == 0) return NULL; else return exts->actions[0]; } #endif int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct nlattr *nest; if (exts->action && tcf_exts_has_actions(exts)) { /* * again for backward compatible mode - we want * to work with both old and new modes of entering * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { nest = nla_nest_start_noflag(skb, exts->action); if (nest == NULL) goto nla_put_failure; if (tcf_action_dump(skb, exts->actions, 0, 0, false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { struct tc_action *act = tcf_exts_first_act(exts); nest = nla_nest_start_noflag(skb, exts->police); if (nest == NULL || !act) goto nla_put_failure; if (tcf_action_dump_old(skb, act, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } } return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; #else return 0; #endif } EXPORT_SYMBOL(tcf_exts_dump); int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct nlattr *nest; if (!exts->action || !tcf_exts_has_actions(exts)) return 0; nest = nla_nest_start_noflag(skb, exts->action); if (!nest) goto nla_put_failure; if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; #else return 0; #endif } EXPORT_SYMBOL(tcf_exts_terse_dump); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct tc_action *a = tcf_exts_first_act(exts); if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0) return -1; #endif return 0; } EXPORT_SYMBOL(tcf_exts_dump_stats); static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) { if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; atomic_inc(&block->offloadcnt); } static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) { if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; atomic_dec(&block->offloadcnt); } static void tc_cls_offload_cnt_update(struct tcf_block *block, struct tcf_proto *tp, u32 *cnt, u32 *flags, u32 diff, bool add) { lockdep_assert_held(&block->cb_lock); spin_lock(&tp->lock); if (add) { if (!*cnt) tcf_block_offload_inc(block, flags); *cnt += diff; } else { *cnt -= diff; if (!*cnt) tcf_block_offload_dec(block, flags); } spin_unlock(&tp->lock); } static void tc_cls_offload_cnt_reset(struct tcf_block *block, struct tcf_proto *tp, u32 *cnt, u32 *flags) { lockdep_assert_held(&block->cb_lock); spin_lock(&tp->lock); tcf_block_offload_dec(block, flags); *cnt = 0; spin_unlock(&tp->lock); } static int __tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop) { struct flow_block_cb *block_cb; int ok_count = 0; int err; list_for_each_entry(block_cb, &block->flow_block.cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err) { if (err_stop) return err; } else { ok_count++; } } return ok_count; } int tc_setup_cb_call(struct tcf_block *block, enum tc_setup_type type, void *type_data, bool err_stop, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return ok_count; } EXPORT_SYMBOL(tc_setup_cb_call); /* Non-destructive filter add. If filter that wasn't already in hardware is * successfully offloaded, increment block offloads counter. On failure, * previously offloaded filter is considered to be intact and offloads counter * is not decremented. */ int tc_setup_cb_add(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } /* Make sure all netdevs sharing this block are offload-capable. */ if (block->nooffloaddevcnt && err_stop) { ok_count = -EOPNOTSUPP; goto err_unlock; } ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); if (ok_count < 0) goto err_unlock; if (tp->ops->hw_add) tp->ops->hw_add(tp, type_data); if (ok_count > 0) tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, ok_count, true); err_unlock: up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return min(ok_count, 0); } EXPORT_SYMBOL(tc_setup_cb_add); /* Destructive filter replace. If filter that wasn't already in hardware is * successfully offloaded, increment block offload counter. On failure, * previously offloaded filter is considered to be destroyed and offload counter * is decremented. */ int tc_setup_cb_replace(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *old_flags, unsigned int *old_in_hw_count, u32 *new_flags, unsigned int *new_in_hw_count, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } /* Make sure all netdevs sharing this block are offload-capable. */ if (block->nooffloaddevcnt && err_stop) { ok_count = -EOPNOTSUPP; goto err_unlock; } tc_cls_offload_cnt_reset(block, tp, old_in_hw_count, old_flags); if (tp->ops->hw_del) tp->ops->hw_del(tp, type_data); ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); if (ok_count < 0) goto err_unlock; if (tp->ops->hw_add) tp->ops->hw_add(tp, type_data); if (ok_count > 0) tc_cls_offload_cnt_update(block, tp, new_in_hw_count, new_flags, ok_count, true); err_unlock: up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return min(ok_count, 0); } EXPORT_SYMBOL(tc_setup_cb_replace); /* Destroy filter and decrement block offload counter, if filter was previously * offloaded. */ int tc_setup_cb_destroy(struct tcf_block *block, struct tcf_proto *tp, enum tc_setup_type type, void *type_data, bool err_stop, u32 *flags, unsigned int *in_hw_count, bool rtnl_held) { bool take_rtnl = READ_ONCE(block->lockeddevcnt) && !rtnl_held; int ok_count; retry: if (take_rtnl) rtnl_lock(); down_read(&block->cb_lock); /* Need to obtain rtnl lock if block is bound to devs that require it. * In block bind code cb_lock is obtained while holding rtnl, so we must * obtain the locks in same order here. */ if (!rtnl_held && !take_rtnl && block->lockeddevcnt) { up_read(&block->cb_lock); take_rtnl = true; goto retry; } ok_count = __tc_setup_cb_call(block, type, type_data, err_stop); tc_cls_offload_cnt_reset(block, tp, in_hw_count, flags); if (tp->ops->hw_del) tp->ops->hw_del(tp, type_data); up_read(&block->cb_lock); if (take_rtnl) rtnl_unlock(); return min(ok_count, 0); } EXPORT_SYMBOL(tc_setup_cb_destroy); int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, enum tc_setup_type type, void *type_data, void *cb_priv, u32 *flags, unsigned int *in_hw_count) { int err = cb(type, type_data, cb_priv); if (err) { if (add && tc_skip_sw(*flags)) return err; } else { tc_cls_offload_cnt_update(block, tp, in_hw_count, flags, 1, add); } return 0; } EXPORT_SYMBOL(tc_setup_cb_reoffload); static int tcf_act_get_user_cookie(struct flow_action_entry *entry, const struct tc_action *act) { struct tc_cookie *user_cookie; int err = 0; rcu_read_lock(); user_cookie = rcu_dereference(act->user_cookie); if (user_cookie) { entry->user_cookie = flow_action_cookie_create(user_cookie->data, user_cookie->len, GFP_ATOMIC); if (!entry->user_cookie) err = -ENOMEM; } rcu_read_unlock(); return err; } static void tcf_act_put_user_cookie(struct flow_action_entry *entry) { flow_action_cookie_destroy(entry->user_cookie); } void tc_cleanup_offload_action(struct flow_action *flow_action) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, flow_action) { tcf_act_put_user_cookie(entry); if (entry->destructor) entry->destructor(entry->destructor_priv); } } EXPORT_SYMBOL(tc_cleanup_offload_action); static int tc_setup_offload_act(struct tc_action *act, struct flow_action_entry *entry, u32 *index_inc, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT if (act->ops->offload_act_setup) { return act->ops->offload_act_setup(act, entry, index_inc, true, extack); } else { NL_SET_ERR_MSG(extack, "Action does not support offload"); return -EOPNOTSUPP; } #else return 0; #endif } int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], u32 miss_cookie_base, struct netlink_ext_ack *extack) { int i, j, k, index, err = 0; struct tc_action *act; BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY); BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE); BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED); if (!actions) return 0; j = 0; tcf_act_for_each_action(i, act, actions) { struct flow_action_entry *entry; entry = &flow_action->entries[j]; spin_lock_bh(&act->tcfa_lock); err = tcf_act_get_user_cookie(entry, act); if (err) goto err_out_locked; index = 0; err = tc_setup_offload_act(act, entry, &index, extack); if (err) goto err_out_locked; for (k = 0; k < index ; k++) { entry[k].hw_stats = tc_act_hw_stats(act->hw_stats); entry[k].hw_index = act->tcfa_index; entry[k].cookie = (unsigned long)act; entry[k].miss_cookie = tcf_exts_miss_cookie_get(miss_cookie_base, i); } j += index; spin_unlock_bh(&act->tcfa_lock); } err_out: if (err) tc_cleanup_offload_action(flow_action); return err; err_out_locked: spin_unlock_bh(&act->tcfa_lock); goto err_out; } int tc_setup_offload_action(struct flow_action *flow_action, const struct tcf_exts *exts, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT u32 miss_cookie_base; if (!exts) return 0; miss_cookie_base = exts->miss_cookie_node ? exts->miss_cookie_node->miss_cookie_base : 0; return tc_setup_action(flow_action, exts->actions, miss_cookie_base, extack); #else return 0; #endif } EXPORT_SYMBOL(tc_setup_offload_action); unsigned int tcf_exts_num_actions(struct tcf_exts *exts) { unsigned int num_acts = 0; struct tc_action *act; int i; tcf_exts_for_each_action(i, act, exts) { if (is_tcf_pedit(act)) num_acts += tcf_pedit_nkeys(act); else num_acts++; } return num_acts; } EXPORT_SYMBOL(tcf_exts_num_actions); #ifdef CONFIG_NET_CLS_ACT static int tcf_qevent_parse_block_index(struct nlattr *block_index_attr, u32 *p_block_index, struct netlink_ext_ack *extack) { *p_block_index = nla_get_u32(block_index_attr); if (!*p_block_index) { NL_SET_ERR_MSG(extack, "Block number may not be zero"); return -EINVAL; } return 0; } int tcf_qevent_init(struct tcf_qevent *qe, struct Qdisc *sch, enum flow_block_binder_type binder_type, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { u32 block_index; int err; if (!block_index_attr) return 0; err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); if (err) return err; qe->info.binder_type = binder_type; qe->info.chain_head_change = tcf_chain_head_change_dflt; qe->info.chain_head_change_priv = &qe->filter_chain; qe->info.block_index = block_index; return tcf_block_get_ext(&qe->block, sch, &qe->info, extack); } EXPORT_SYMBOL(tcf_qevent_init); void tcf_qevent_destroy(struct tcf_qevent *qe, struct Qdisc *sch) { if (qe->info.block_index) tcf_block_put_ext(qe->block, sch, &qe->info); } EXPORT_SYMBOL(tcf_qevent_destroy); int tcf_qevent_validate_change(struct tcf_qevent *qe, struct nlattr *block_index_attr, struct netlink_ext_ack *extack) { u32 block_index; int err; if (!block_index_attr) return 0; err = tcf_qevent_parse_block_index(block_index_attr, &block_index, extack); if (err) return err; /* Bounce newly-configured block or change in block. */ if (block_index != qe->info.block_index) { NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); return -EINVAL; } return 0; } EXPORT_SYMBOL(tcf_qevent_validate_change); struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, struct sk_buff *skb, struct sk_buff **to_free, int *ret) { struct tcf_result cl_res; struct tcf_proto *fl; if (!qe->info.block_index) return skb; fl = rcu_dereference_bh(qe->filter_chain); switch (tcf_classify(skb, NULL, fl, &cl_res, false)) { case TC_ACT_SHOT: qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); *ret = __NET_XMIT_BYPASS; return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: __qdisc_drop(skb, to_free); *ret = __NET_XMIT_STOLEN; return NULL; case TC_ACT_REDIRECT: skb_do_redirect(skb); *ret = __NET_XMIT_STOLEN; return NULL; } return skb; } EXPORT_SYMBOL(tcf_qevent_handle); int tcf_qevent_dump(struct sk_buff *skb, int attr_name, struct tcf_qevent *qe) { if (!qe->info.block_index) return 0; return nla_put_u32(skb, attr_name, qe->info.block_index); } EXPORT_SYMBOL(tcf_qevent_dump); #endif static __net_init int tcf_net_init(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); spin_lock_init(&tn->idr_lock); idr_init(&tn->idr); return 0; } static void __net_exit tcf_net_exit(struct net *net) { struct tcf_net *tn = net_generic(net, tcf_net_id); idr_destroy(&tn->idr); } static struct pernet_operations tcf_net_ops = { .init = tcf_net_init, .exit = tcf_net_exit, .id = &tcf_net_id, .size = sizeof(struct tcf_net), }; static const struct rtnl_msg_handler tc_filter_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWTFILTER, .doit = tc_new_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_DELTFILTER, .doit = tc_del_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_GETTFILTER, .doit = tc_get_tfilter, .dumpit = tc_dump_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.msgtype = RTM_NEWCHAIN, .doit = tc_ctl_chain}, {.msgtype = RTM_DELCHAIN, .doit = tc_ctl_chain}, {.msgtype = RTM_GETCHAIN, .doit = tc_ctl_chain, .dumpit = tc_dump_chain}, }; static int __init tc_filter_init(void) { int err; tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0); if (!tc_filter_wq) return -ENOMEM; err = register_pernet_subsys(&tcf_net_ops); if (err) goto err_register_pernet_subsys; xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1); rtnl_register_many(tc_filter_rtnl_msg_handlers); return 0; err_register_pernet_subsys: destroy_workqueue(tc_filter_wq); return err; } subsys_initcall(tc_filter_init); |
| 418 366 683 648 683 699 919 893 311 311 1240 683 683 286 1 30 193 13 353 249 9 7 42 287 286 253 285 32 250 274 274 242 242 7 132 2 1 1 3 10 29 11 11 11 92 92 1 16 314 41 41 41 41 44 8552 89 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAPOPS_H #define _LINUX_SWAPOPS_H #include <linux/radix-tree.h> #include <linux/bug.h> #include <linux/mm_types.h> #ifdef CONFIG_MMU #ifdef CONFIG_SWAP #include <linux/swapfile.h> #endif /* CONFIG_SWAP */ /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in * the low-order bits. * * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS #define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ #define SWP_PFN_BITS min_t(int, \ sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: * * |----------+--------------------| * | swp_type | swp_offset | * |----------+--------+-+-+-------| * | | resv |D|A| PFN | * |----------+--------+-+-+-------| * * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) * * Note: A/D bits will be stored in migration entries iff there're enough * free bits in arch specific swp offset. By default we'll ignore A/D bits * when migrating a page. Please refer to migration_entry_supports_ad() * for more information. If there're more bits besides PFN and A/D bits, * they should be reserved and always be zeros. */ #define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) #define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) #define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) static inline bool is_pfn_swap_entry(swp_entry_t entry); /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { if (pte_swp_exclusive(pte)) pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) pte = pte_swp_clear_uffd_wp(pte); return pte; } /* * Store a type+offset into a swp_entry_t in an arch-independent format */ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } /* * Extract the `type' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline unsigned swp_type(swp_entry_t entry) { return (entry.val >> SWP_TYPE_SHIFT); } /* * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline pgoff_t swp_offset(swp_entry_t entry) { return entry.val & SWP_OFFSET_MASK; } /* * This should only be called upon a pfn swap entry to get the PFN stored * in the swap entry. Please refers to is_pfn_swap_entry() for definition * of pfn swap entry. */ static inline unsigned long swp_offset_pfn(swp_entry_t entry) { VM_BUG_ON(!is_pfn_swap_entry(entry)); return swp_offset(entry) & SWP_PFN_MASK; } /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. */ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; pte = pte_swp_clear_flags(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ static inline pte_t swp_entry_to_pte(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pte(arch_entry); } static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_READ, offset); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_WRITE, offset); } static inline bool is_device_private_entry(swp_entry_t entry) { int type = swp_type(entry); return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_private_entry(swp_entry_t entry) { return false; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return false; } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return false; } #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_writable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_readable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ); } static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); } static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); } /* * Returns whether the host has large enough swap offset field to support * carrying over pgtable A/D bits for page migrations. The result is * pretty much arch specific. */ static inline bool migration_entry_supports_ad(void) { #ifdef CONFIG_SWAP return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG); return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_YOUNG; /* Keep the old behavior of aging page after migration */ return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY); return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_DIRTY; /* Keep the old behavior of clean page after migration */ return false; } extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline int is_migration_entry(swp_entry_t swp) { return 0; } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } static inline int is_readable_migration_entry(swp_entry_t entry) { return 0; } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { return false; } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE /* * Support for hardware poisoned pages */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { BUG_ON(!PageLocked(page)); return swp_entry(SWP_HWPOISON, page_to_pfn(page)); } static inline int is_hwpoison_entry(swp_entry_t entry) { return swp_type(entry) == SWP_HWPOISON; } #else static inline swp_entry_t make_hwpoison_entry(struct page *page) { return swp_entry(0, 0); } static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } #endif typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) /* * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. * * Note that, when encountered by the faulting logic, PTEs with this marker will * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error * logic. */ #define PTE_MARKER_POISONED BIT(1) /* * Indicates that, on fault, this PTE will case a SIGSEGV signal to be * sent. This means guard markers behave in effect as if the region were mapped * PROT_NONE, rather than if they were a memory hole or equivalent. */ #define PTE_MARKER_GUARD BIT(2) #define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); } static inline bool is_pte_marker_entry(swp_entry_t entry) { return swp_type(entry) == SWP_PTE_MARKER; } static inline pte_marker pte_marker_get(swp_entry_t entry) { return swp_offset(entry) & PTE_MARKER_MASK; } static inline bool is_pte_marker(pte_t pte) { return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); } static inline swp_entry_t make_poisoned_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_POISONED); } static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_POISONED); } static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } static inline int is_guard_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_GUARD); } /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker * should be seen as a none pte; it's just that we have stored some information * onto the none pte so it becomes not-none any more. * * It should be used when the pte is file-backed, ram-based and backing * userspace pages, like shmem. It is not needed upon pgtables that do not * support pte markers at all. For example, it's not needed on anonymous * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. */ static inline int pte_none_mostly(pte_t pte) { return pte_none(pte) || is_pte_marker(pte); } static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding page is locked */ BUG_ON(is_migration_entry(entry) && !PageLocked(p)); return p; } static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) { struct folio *folio = pfn_folio(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding folio is locked */ BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); return folio; } /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They can either be used to represent unaddressable device * memory, to restrict access to a page undergoing migration or to represent a * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { /* Make sure the swp offset can always store the needed fields */ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new); extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { swp_entry_t arch_entry; if (pmd_swp_soft_dirty(pmd)) pmd = pmd_swp_clear_soft_dirty(pmd); if (pmd_swp_uffd_wp(pmd)) pmd = pmd_swp_clear_uffd_wp(pmd); arch_entry = __pmd_to_swp_entry(pmd); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pmd(arch_entry); } static inline int is_pmd_migration_entry(pmd_t pmd) { return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); } static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { BUILD_BUG(); } static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { return swp_entry(0, 0); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ |
| 10 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILEATTR_H #define _LINUX_FILEATTR_H /* Flags shared betwen flags/xflags */ #define FS_COMMON_FL \ (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NODUMP_FL | FS_NOATIME_FL | FS_DAX_FL | \ FS_PROJINHERIT_FL) #define FS_XFLAG_COMMON \ (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \ FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \ FS_XFLAG_PROJINHERIT) /* * Merged interface for miscellaneous file attributes. 'flags' originates from * ext* and 'fsx_flags' from xfs. There's some overlap between the two, which * is handled by the VFS helpers, so filesystems are free to implement just one * or both of these sub-interfaces. */ struct fileattr { u32 flags; /* flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */ /* struct fsxattr: */ u32 fsx_xflags; /* xflags field value (get/set) */ u32 fsx_extsize; /* extsize field value (get/set)*/ u32 fsx_nextents; /* nextents field value (get) */ u32 fsx_projid; /* project identifier (get/set) */ u32 fsx_cowextsize; /* CoW extsize field value (get/set)*/ /* selectors: */ bool flags_valid:1; bool fsx_valid:1; }; int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa); void fileattr_fill_xflags(struct fileattr *fa, u32 xflags); void fileattr_fill_flags(struct fileattr *fa, u32 flags); /** * fileattr_has_fsx - check for extended flags/attributes * @fa: fileattr pointer * * Return: true if any attributes are present that are not represented in * ->flags. */ static inline bool fileattr_has_fsx(const struct fileattr *fa) { return fa->fsx_valid && ((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 || fa->fsx_projid != 0 || fa->fsx_cowextsize != 0); } int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); #endif /* _LINUX_FILEATTR_H */ |
| 125 78 98 32 32 24 32 10 25 32 32 31 24 30 3 31 23 31 1 32 24 32 32 36 15 12 37 23 23 37 5 32 32 61 53 61 61 61 61 8 8 61 60 15 49 2 1 48 48 6 5 42 48 45 47 12 26 27 5 27 12 12 27 28 1 1 99 65 99 91 78 78 92 32 32 61 61 66 66 66 234 235 234 67 67 65 65 62 205 8 37 37 37 388 253 387 367 34 389 86 387 5 388 141 12 141 140 141 140 140 1 138 86 79 32 17 31 125 121 124 125 94 126 125 121 120 60 121 121 86 49 31 35 78 42 90 70 19 17 17 51 66 22 22 22 13 8 9 47 49 13 14 13 12 2 1 11 10 10 10 10 13 109 109 109 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/module.h> #include <linux/io_uring/cmd.h> #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb_is_dsync(iocb)) opf |= REQ_FUA; return opf; } static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); if (!vecs) return -ENOMEM; } if (iov_iter_rw(iter) == READ) { bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); if (user_backed_iter(iter)) should_dirty = true; } else { bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } enum { DIO_SHOULD_DIRTY = 1, DIO_IS_SYNC = 2, }; struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; unsigned int flags; struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret); bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* * Grab an extra reference to ensure the dio structure which is embedded * into the first bio stays around. */ bio_get(bio); is_sync = is_sync_kiocb(iocb); if (is_sync) { dio->flags = DIO_IS_SYNC; dio->waiter = current; } else { dio->flags = 0; dio->iocb = iocb; } dio->size = 0; if (is_read && user_backed_iter(iter)) dio->flags |= DIO_SHOULD_DIRTY; blk_start_plug(&plug); for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (iocb->ki_flags & IOCB_NOWAIT) { /* * This is nonblocking IO, and we need to allocate * another bio if we have data left to map. As we * cannot guarantee that one of the sub bios will not * fail getting issued FOR NOWAIT and as error results * are coalesced across all of them, be safe and ask for * a retry of this from blocking context. */ if (unlikely(iov_iter_count(iter))) { ret = -EAGAIN; goto fail; } bio->bi_opf |= REQ_NOWAIT; } if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { task_io_account_write(bio->bi_iter.bi_size); } dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { submit_bio(bio); break; } atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; fail: bio_release_pages(bio, false); bio_clear_flag(bio, BIO_REFFED); bio_put(bio); blk_finish_plug(&plug); return ret; } static void blkdev_bio_end_io_async(struct bio *bio) { struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(bio->bi_status); } if (iocb->ki_flags & IOCB_HAS_METADATA) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; int ret = 0; if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; if (iov_iter_is_bvec(iter)) { /* * Users don't rely on the iterator being in any particular * state for async I/O returning -EIOCBQUEUED, hence we can * avoid expensive iov_iter_advance(). Bypass * bio_iov_iter_get_pages() and set the bvec directly. */ bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) goto out_bio_put; } dio->size = bio->bi_iter.bi_size; if (is_read) { if (user_backed_iter(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); WRITE_ONCE(iocb->private, NULL); if (unlikely(ret)) goto out_bio_put; } if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) { bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { submit_bio(bio); } return -EIOCBQUEUED; out_bio_put: bio_put(bio); return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; if (iov_iter_rw(iter) == WRITE) { u16 max_write_streams = bdev_max_write_streams(bdev); if (iocb->ki_write_stream) { if (iocb->ki_write_stream > max_write_streams) return -EINVAL; } else if (max_write_streams) { enum rw_hint write_hint = file_inode(iocb->ki_filp)->i_write_hint; /* * Just use the write hint as write stream for block * device writes. This assumes no file system is * mounted that would use the streams differently. */ if (write_hint <= max_write_streams) iocb->ki_write_stream = write_hint; } } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); if (offset >= isize) return -EIO; iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; #ifdef CONFIG_BUFFER_HEAD static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } /* * We cannot call mpage_writepages() as it does not take the buffer lock. * We must use block_write_full_folio() directly which holds the buffer * lock. The buffer lock provides the synchronisation with writeback * that filesystems rely on when they use the blockdev's mapping. */ static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &err))) err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); folio_unlock(folio); folio_put(folio); return ret; } const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { return iomap_read_folio(folio, &blkdev_iomap_ops); } static void blkdev_readahead(struct readahead_control *rac) { iomap_readahead(rac, &blkdev_iomap_ops); } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned int len) { loff_t isize = i_size_read(inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; if (offset >= wpc->iomap.offset && offset < wpc->iomap.offset + wpc->iomap.length) return 0; return blkdev_iomap_begin(inode, offset, isize - offset, IOMAP_WRITE, &wpc->iomap, NULL); } static const struct iomap_writeback_ops blkdev_writeback_ops = { .map_blocks = blkdev_map_blocks, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct iomap_writepage_ctx wpc = { }; return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); } const struct address_space_operations def_blk_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; return error; } /** * file_to_blk_mode - get block open flags from file flags * @file: file whose open flags should be converted * * Look at file open flags and generate corresponding block open flags from * them. The function works both for file just being open (e.g. during ->open * callback) and for file that is already open. This is actually non-trivial * (see comment in the function). */ blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* * do_dentry_open() clears O_EXCL from f_flags, use file->private_data * to determine whether the open was exclusive for already open files. */ if (file->private_data) mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; /* * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy * driver has historically allowed ioctls as if the file was opened for * writing, but does not allow and actual reads or writes. */ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) mode |= BLK_OPEN_WRITE_IOCTL; return mode; } static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; blk_mode_t mode; int ret; mode = file_to_blk_mode(filp); /* Use the file as the holder. */ if (mode & BLK_OPEN_EXCL) filp->private_data = filp; ret = bdev_permission(inode->i_rdev, mode, filp->private_data); if (ret) return ret; bdev = blkdev_get_no_open(inode->i_rdev, true); if (!bdev) return -ENXIO; if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { bdev_release(filp); return 0; } static ssize_t blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) { size_t count = iov_iter_count(from); ssize_t written; written = kiocb_invalidate_pages(iocb, count); if (written) { if (written == -EBUSY) return 0; return written; } written = blkdev_direct_IO(iocb, from); if (written > 0) { kiocb_invalidate_post_direct_write(iocb, count); iocb->ki_pos += written; count -= written; } if (written != -EIOCBQUEUED) iov_iter_revert(from, count - iov_iter_count(from)); return written; } static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(bd_inode); bool atomic = iocb->ki_flags & IOCB_ATOMIC; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; if (atomic) { ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } size -= iocb->ki_pos; if (iov_iter_count(from) > size) { if (atomic) return -EINVAL; shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { /* * Take i_rwsem and invalidate_lock to avoid racing with * set_blocksize changing i_blkbits/folio order and punching * out the pagecache. */ inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); inode_unlock_shared(bd_inode); } if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); return ret; } static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; size_t count; if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } count = iov_iter_count(to); if (!count) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { ret = kiocb_write_and_wait(iocb, count); if (ret < 0) goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); if (ret > 0) { iocb->ki_pos += ret; count -= ret; } if (ret != -EIOCBQUEUED) iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } /* * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize * changing i_blkbits/folio order and punching out the pagecache. */ inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); /* * Invalidate the page cache, including dirty pages, for valid * de-allocate mode calls to fallocate(). */ switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; default: error = -EOPNOTSUPP; } fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); return error; } static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); if (bdev_read_only(I_BDEV(bd_inode))) return generic_file_readonly_mmap(file, vma); return generic_file_mmap(file, vma); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init); |
| 3 3 3 3 3 3 3 2 1 1 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 617 617 617 617 1 23 720 720 720 325 31 9 23 23 23 14 102 102 103 102 1 103 103 27 27 27 27 2 2 2 2 2 2 2 2 2 2 2 27 2 2 2 2 2 2 82 82 40 7 6 7 7 7 7 7 4 4 4 4 4 4 4 4 4 4 4 7 7 7 6 7 7 7 6 7 5 7 7 3 3 3 2 27 28 28 14 14 14 14 2 13 7 7 7 14 28 28 28 707 20 65 67 67 65 51 50 35 50 50 49 49 67 66 67 67 66 629 630 630 159 159 159 159 718 2 2 2 2 2 2 502 502 81 81 50 50 499 2 40 40 40 40 40 40 40 40 40 21 21 16 26 287 287 286 287 286 252 287 611 609 609 154 154 153 154 154 154 357 349 627 446 448 448 447 447 40 446 286 279 279 502 630 630 628 629 548 629 39 39 39 39 39 39 2 2 2 4 7 7 7 7 6 6 2 1 7 4 4 4 3 4 4 4 4 4 4 1 1 2 7 3 1 3 3 65 672 668 673 668 146 4 666 662 594 594 595 592 74 74 547 515 591 194 501 365 30 364 356 357 357 630 630 623 82 56 53 82 74 72 629 705 492 711 711 719 638 635 488 9 8 628 579 673 193 190 190 125 72 23 124 50 1 49 17 657 95 76 95 616 157 71 70 24 71 98 502 496 347 349 498 494 64 673 719 657 656 599 18 18 18 18 18 18 1 18 18 18 1 18 18 18 18 18 8 7 18 18 9 9 9 9 9 9 18 18 9 9 8 9 1 1 18 18 1 9 9 8 9 9 9 9 9 9 9 9 9 9 19 19 19 19 19 19 19 9 19 19 14 14 12 1 1 17 1 18 17 17 18 18 17 16 16 16 16 16 1 15 16 4 14 1 14 1 14 14 4 14 1 14 1 14 1 14 5 4 4 1 9 1 9 10 2 10 2 10 10 1 1 1 9 9 9 6 167 1 1 1 615 615 615 1 615 615 615 1 1 19 19 615 615 615 615 615 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include "fib_lookup.h" #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) #define DEFAULT_MIN_ADVMSS 256 static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. */ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #else #define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) #endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* st->gc_goal_miss */ 0, /* st->gc_dst_overflow */ 0, /* st->in_hlist_search */ 0 /* st->out_hlist_search */ ); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_create_seq("rt_cache", 0444, net->proc_net, &rt_cache_seq_ops); if (!pde) goto err1; pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_ops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create_single("rt_acct", 0, net->proc_net, rt_acct_proc_show); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline bool rt_is_expired(const struct rtable *rth) { bool res; rcu_read_lock(); res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); rcu_read_unlock(); return res; } void rt_cache_flush(struct net *net) { rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; struct neighbour *n; rcu_read_lock(); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { n = ip_neigh_gw6(dev, &rt->rt_gw6); } else { __be32 pkey; pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr); n = ip_neigh_gw4(dev, pkey); } if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst->dev; const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; } else if (rt->rt_gw_family == AF_INET6) { return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); } else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } /* Hash tables of size 2048..262144 depending on RAM size. * Each bucket uses 8 bytes. */ static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker * to infer how many packets were sent between two points in time. */ static u32 ip_idents_reserve(u32 hash, int segs) { u32 bucket, old, now = (u32)jiffies; atomic_t *p_id; u32 *p_tstamp; u32 delta = 0; bucket = hash & ip_idents_mask; p_tstamp = ip_tstamps + bucket; p_id = ip_idents + bucket; old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = get_random_u32_below(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it that was a bug * in UBSAN, and it has been fixed in GCC-8. */ return atomic_add_return(segs + delta, p_id) - segs; } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { u32 hash, id; /* Note the following code is not safe, but this is okay. */ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) get_random_bytes(&net->ipv4.ip_id_key, sizeof(net->ipv4.ip_id_key)); hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, int flow_flags) { __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk->sk_uid); rcu_read_unlock(); } static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); else build_sk_flow_key(fl4, sk); } static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { struct rtable *rt; rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } } static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception __rcu **fnhe_p, **oldest_p; struct fib_nh_exception *fnhe, *oldest = NULL; for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); if (!fnhe) break; if (!oldest || time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; oldest_p = fnhe_p; } } fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) { static siphash_aligned_key_t fnhe_hash_key; u64 hval; net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } } static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; u32 genid, hval; unsigned int i; int depth; genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; depth = 0; for (fnhe = rcu_dereference(hash->chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) break; depth++; } if (fnhe) { if (fnhe->fnhe_genid != genid) fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) fill_route_from_fnhe(rt, fnhe); rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) fill_route_from_fnhe(rt, fnhe); } else { /* Randomize max depth to avoid some side channels attacks. */ int max_depth = FNHE_RECLAIM_DEPTH + get_random_u32_below(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); depth--; } fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); if (!fnhe) goto out_unlock; fnhe->fnhe_next = hash->chain; fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); rcu_assign_pointer(hash->chain, fnhe); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; for_each_possible_cpu(i) { struct rtable __rcu **prt; prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) rt->dst.obsolete = DST_OBSOLETE_KILL; } } fnhe->fnhe_stamp = jiffies; out_unlock: spin_unlock_bh(&fnhe_lock); } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct neighbour *n; struct net *net; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: break; default: return; } if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } if (kill_route) rt->dst.obsolete = DST_OBSOLETE_KILL; call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) { const struct iphdr *iph = (const struct iphdr *) skb->data; __be32 daddr = iph->daddr; __be32 saddr = iph->saddr; net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); } #endif ; } static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); if ((dst->obsolete > 0) || (rt->rt_flags & RTCF_REDIRECTED) || rt->dst.expires) sk_dst_reset(sk); } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; struct net *net; int log_martians; int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; peer->n_redirects = 0; } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_unlock; } /* Check for load limit; set rate_last to the latest sent * redirect. */ if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } out_unlock: rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct inet_peer *peer; unsigned long now; struct net *net; SKB_DR(reason); bool send; int code; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) goto out; } in_dev = __in_dev_get_rcu(dev); /* IP on this device is disabled. */ if (!in_dev) goto out; net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: SKB_DR_SET(reason, IP_INADDRERRORS); __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; } switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; } rcu_read_unlock(); if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb_reason(skb, reason); return 0; } static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; bool lock = false; struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) return; old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; rcu_read_lock(); net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res.fi) > 1) { int nhsel; for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { nhc = fib_info_nhc(res.fi, nhsel); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } out: rcu_read_unlock(); } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); /* Don't make lookup fail for bridged encapsulations */ if (skb && netif_is_any_bridge_port(skb->dev)) fl4.flowi4_oif = 0; __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; struct net *net = sock_net(sk); bh_lock_sock(sk); if (!ip_sk_accept_pmtu(sk)) goto out; odst = sk_dst_get(sk); if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } if (new) sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. */ if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct net_device *dev; struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. * Also check we have a reasonable ipv4 header. */ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; memset(&opt, 0, sizeof(opt)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) return; } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* * We do not cache source address of outgoing interface, * because it is used only by IP RR, TS and SRR options, * so that it out of fast path. * * BTW remember: "addr" is allowed to be not aligned * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); unsigned int advmss; struct net *net; rcu_read_lock(); net = dev_net_rcu(dst->dev); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; u32 hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device */ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_nh_common *nhc = res->nhc; struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; u32 mtu = 0; if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { bool ret = false; spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { struct rtable __rcu **porig; struct rtable *orig; int genid = fnhe_genid(dev_net(rt->dst.dev)); if (rt_is_input_route(rt)) porig = &fnhe->fnhe_rth_input; else porig = &fnhe->fnhe_rth_output; orig = rcu_dereference(*porig); if (fnhe->fnhe_genid != genid) { fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; fnhe->fnhe_mtu_locked = false; fnhe_flush_routes(fnhe); orig = NULL; } fill_route_from_fnhe(rt, fnhe); if (!rt->rt_gw4) { rt->rt_gw4 = daddr; rt->rt_gw_family = AF_INET; } if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); } ret = true; } fnhe->fnhe_stamp = jiffies; } spin_unlock_bh(&fnhe_lock); return ret; } static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nhc->nhc_rth_input; } else { p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; /* hold dst before doing cmpxchg() to avoid race condition * on this dst */ dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { dst_release(&rt->dst); ret = false; } return ret; } struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt_del_uncached_list(struct rtable *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void ipv4_dst_destroy(struct dst_entry *dst) { ip_dst_metrics_put(dst); rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) { struct rtable *rt, *safe; int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static bool rt_cache_valid(const struct rtable *rt) { return rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag, const bool do_cache) { bool cached = false; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) rt->rt_gw4 = nhc->nhc_gw.ipv4; else rt->rt_gw6 = nhc->nhc_gw.ipv6; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); rt->dst.tclassid = nh->nh_tclassid; } #endif rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ if (!rt->rt_gw4) { rt->rt_gw_family = AF_INET; rt->rt_gw4 = daddr; } rt_add_uncached_list(rt); } } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; } EXPORT_SYMBOL(rt_dst_alloc); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) { struct rtable *new_rt; new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, rt->dst.flags); if (new_rt) { new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); new_rt->rt_flags = rt->rt_flags; new_rt->rt_type = rt->rt_type; new_rt->rt_is_input = rt->rt_is_input; new_rt->rt_iif = rt->rt_iif; new_rt->rt_pmtu = rt->rt_pmtu; new_rt->rt_mtu_locked = rt->rt_mtu_locked; new_rt->rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; new_rt->dst.input = rt->dst.input; new_rt->dst.output = rt->dst.output; new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); } return new_rt; } EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) return SKB_DROP_REASON_NOT_SPECIFIED; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) return SKB_DROP_REASON_IP_INVALID_SOURCE; if (skb->protocol != htons(ETH_P_IP)) return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) return reason; } return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ static enum skb_drop_reason ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (reason) return reason; if (our) flags |= RTCF_LOCAL; if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_is_input= 1; #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return SKB_NOT_DROPPED_YET; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ pr_warn("martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), dev->hard_header_len, false); } } #endif } /* called in rcu_read_lock() section */ static enum skb_drop_reason __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; bool do_cache; u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { __be32 gw; gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { reason = SKB_DROP_REASON_NOMEM; goto cleanup; } rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: reason = SKB_NOT_DROPPED_YET; cleanup: return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH /* To make ICMP packets follow the right flow, the multipath hash is * calculated from the inner IP addresses. */ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; if (likely(outer_iph->protocol != IPPROTO_ICMP)) goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; out: hash_keys->addrs.v4addrs.src = key_iph->saddr; hash_keys->addrs.v4addrs.dst = key_iph->daddr; } static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = fl4->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash = 0; switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); } else { hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); /* skb is currently provided only when forwarding */ if (skb) { struct flow_keys keys; skb_flow_dissect_flow_keys(skb, &keys, 0); /* Inner can be v4 or v6 */ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; hash_keys.tags.flow_label = keys.tags.flow_label; hash_keys.basic.ip_proto = keys.basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; ip_multipath_l3_keys(skb, &hash_keys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = fib_multipath_custom_hash_skb(net, skb); else mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static enum skb_drop_reason ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); u32 tag = 0; if (!in_dev) return reason; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &tag); if (reason) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); return reason; } /* get device for dst_alloc with local routes */ static struct net_device *ip_rt_get_dev(struct net *net, const struct fib_result *res) { struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; struct net_device *dev = NULL; if (nhc) dev = l3mdev_master_dev_rcu(nhc->nhc_dev); return dev ? : net->loopback_dev; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * Changes in the enforced policies must be applied also to * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. * called with rcu_read_lock() */ static enum skb_drop_reason ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; struct flowi4 fl4; bool do_cache = true; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected * by fib_lookup. */ tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } res->fi = NULL; res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(daddr)) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; } } else if (ipv4_is_loopback(saddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } } /* * Now we are ready to route packet. */ fl4.flowi4_l3mdev = 0; fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; } else { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; /* not do cache if bc_forwarding is enabled */ if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING)) do_cache = false; goto brd_input; } err = -EINVAL; if (res->type == RTN_LOCAL) { reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) { err = -EHOSTUNREACH; goto no_route; } if (res->type != RTN_UNICAST) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } make_route: reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); out: return reason; brd_input: if (skb->protocol != htons(ETH_P_IP)) { reason = SKB_DROP_REASON_INVALID_PROTO; goto out; } if (!ipv4_is_zeronet(saddr)) { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; } flags |= RTCF_BROADCAST; res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; } } rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; no_route: RT_CACHE_STAT_INC(in_no_route); res->type = RTN_UNREACHABLE; res->fi = NULL; res->table = NULL; goto local_input; /* * Do not cache martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif goto out; e_nobufs: reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } /* called with rcu_read_lock held */ static enum skb_drop_reason ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing * hardware multicast filters :-( As result the host on multicasting * network acquires a lot of useless route cache entries, sort of * SDR messages from all the world. Now we try to get rid of them. * Really, provided software IP multicast filter is organized * reasonably (at least, hashed), it does not result in a slowdown * comparing with route cache reject entries. * Note, that multicast routers are not affected, because * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; if (!in_dev) return reason; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); /* check l3 master if no match yet */ if (!our && netif_is_l3_slave(dev)) { struct in_device *l3_in_dev; l3_in_dev = __in_dev_get_rcu(skb->dev); if (l3_in_dev) our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, ip_hdr(skb)->protocol); } if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } return reason; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; rcu_read_lock(); reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); return reason; } EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) type = RTN_BROADCAST; else if (ipv4_is_multicast(fl4->daddr)) type = RTN_MULTICAST; else if (ipv4_is_zeronet(fl4->daddr)) return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; else do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ if (fi && res->prefixlen < 4) fi = NULL; } else if ((type == RTN_LOCAL) && (orig_oif != 0) && (orig_oif != dev_out->ifindex)) { /* For local routes that require a particular output interface * we do not want to cache the result. Caching the result * causes incorrect behaviour when there are multiple source * addresses on the interface, the end result being that if the * intended recipient is waiting on that interface for the * packet he won't receive it because it will be delivered on * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; RT_CACHE_STAT_INC(out_slow_tot); if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } } #endif } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); lwtunnel_set_redirect(&rth->dst); return rth; } /* * Major route resolver routine. */ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, .table = NULL, .tclassid = 0, }; struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); rcu_read_unlock(); return rth; } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; int err; if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. * It was wrong for two reasons: * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr * is assigned to multiple interfaces. * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (!dev_out) goto out; /* Special hack: user can direct multicasts * and limited broadcast via necessary interface * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. * This hack is not just for fun, it allows * vic,vat and friends to work. * They bind socket to loopback, set ttl to zero * and expect that it will work. * From the viewpoint of routing cache they are broken, * because we are not allowed to build multicast path * with loopback source addr (look, routing cache * cannot know, that ttl is zero, so that packet * will not leave this host and route is valid). * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr) || fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * * WHY? DW. * Because we are allowed to send to iface * even if it has NO routes and NO assigned * addresses. When oif is specified, routing * tables are looked up with only one purpose: * to catch if destination is gatewayed, rather than * direct. Moreover, if MSG_DONTROUTE is set, * we send packet, ignoring both routing tables * and ifaddr state. --ANK * * * We could make it even if oif is unknown, * likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } if (res->type == RTN_LOCAL) { if (!fl4->saddr) { if (res->fi->fib_prefsrc) fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : net->loopback_dev; /* make sure orig_oif points to fib result device even * though packet rx/tx happens over loopback or l3mdev */ orig_oif = FIB_RES_OIF(*res); fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: return rth; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .default_advmss = ipv4_default_advmss, .neigh_lookup = ipv4_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; new->dev = net->loopback_dev; netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0)); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, dscp_t dscp, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; u32 error; u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; if (rt->dst.lwtstate && lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway) { if (rt->rt_gw_family == AF_INET && nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; } else if (rt->rt_gw_family == AF_INET6) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &rt->rt_gw6, alen); } } expires = rt->dst.expires; if (expires) { unsigned long now = jiffies; if (time_before(now, expires)) expires -= now; else expires = 0; } memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rt->rt_mtu_locked && expires) metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (fl4) { if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && nla_put_u32(skb, RTA_UID, from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) goto nla_put_failure; if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); if (err <= 0) { if (err == 0) return 0; goto nla_put_failure; } } else #endif if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } } error = rt->dst.error; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, int *fa_index, int fa_start, unsigned int flags) { int i; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference(bucket[i].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { struct rtable *rt; int err; if (*fa_index < fa_start) goto next; if (fnhe->fnhe_genid != genid) goto next; if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) goto next; rt = rcu_dereference(fnhe->fnhe_rth_input); if (!rt) rt = rcu_dereference(fnhe->fnhe_rth_output); if (!rt) goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) return err; next: (*fa_index)++; } } return 0; } int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); struct fnhe_hash_bucket *bucket; int err; if (nhc->nhc_flags & RTNH_F_DEAD) continue; rcu_read_lock(); bucket = rcu_dereference(nhc->nhc_exceptions); err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid, fa_index, fa_start, flags); rcu_read_unlock(); if (err) return err; } return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) { struct sk_buff *skb; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; /* Reserve room for dummy headers, this skb can pass * through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = ip_proto; iph->saddr = src; iph->daddr = dst; iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; skb_set_transport_header(skb, skb->len); switch (iph->protocol) { case IPPROTO_UDP: { struct udphdr *udph; udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } case IPPROTO_TCP: { struct tcphdr *tcph; tcph = skb_put_zero(skb, sizeof(struct tcphdr)); tcph->source = sport; tcph->dest = dport; tcph->doff = sizeof(struct tcphdr) / 4; tcph->rst = 1; tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), src, dst, 0); break; } case IPPROTO_ICMP: { struct icmphdr *icmph; icmph = skb_put_zero(skb, sizeof(struct icmphdr)); icmph->type = ICMP_ECHO; icmph->code = 0; } } return skb; } static int inet_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_NOTIFY | RTM_F_LOOKUP_TABLE | RTM_F_FIB_MATCH)) { NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_IIF: case RTA_OIF: case RTA_SRC: case RTA_DST: case RTA_IP_PROTO: case RTA_SPORT: case RTA_DPORT: case RTA_MARK: case RTA_UID: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); return -EINVAL; } } return 0; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; u32 table_id = RT_TABLE_MAIN; __be16 sport = 0, dport = 0; struct fib_result res = {}; u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; dscp_t dscp; kuid_t uid; u32 iif; int err; int mark; err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); src = nla_get_in_addr_default(tb[RTA_SRC], 0); dst = nla_get_in_addr_default(tb[RTA_DST], 0); iif = nla_get_u32_default(tb[RTA_IIF], 0); mark = nla_get_u32_default(tb[RTA_MARK], 0); dscp = inet_dsfield_to_dscp(rtm->rtm_tos); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else uid = (iif ? INVALID_UID : current_uid()); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &ip_proto, AF_INET, extack); if (err) return err; } if (tb[RTA_SPORT]) sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) dport = nla_get_be16(tb[RTA_DPORT]); skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); if (!skb) return -ENOBUFS; fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) fl4.fl4_sport = sport; if (dport) fl4.fl4_dport = dport; fl4.flowi4_proto = ip_proto; rcu_read_lock(); if (iif) { struct net_device *dev; dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_rcu; } fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, dscp, dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); else skb_dst_set(skb, &rt->dst); } if (err) goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? res.table->tb_id : 0; /* reset skb for netlink reply msg */ skb_trim(skb, 0); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { struct fib_rt_info fri; if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } fri.fi = res.fi; fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { u8 slen = 32 - fri.dst_len; if (fa->fa_slen == slen && fa->tb_id == fri.tb_id && fa->fa_dscp == fri.dscp && fa->fa_info == res.fi && fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); break; } } } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: return err; errout_rcu: rcu_read_unlock(); kfree_skb(skb); goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; if (write) { rt_cache_flush(net); fnhe_genid_bump(net); return 0; } return -EINVAL; } static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. Use gc_min_interval_ms */ .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static const char ipv4_route_flush_procname[] = "flush"; static struct ctl_table ipv4_route_netns_table[] = { { .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { .procname = "min_pmtu", .data = &init_net.ipv4.ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &ip_min_valid_pmtu, }, { .procname = "mtu_expires", .data = &init_net.ipv4.ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv4.ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; size_t table_size = ARRAY_SIZE(ipv4_route_netns_table); tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL); if (!tbl) goto err_dup; /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; } /* Update the variables to point into the current struct net * except for the first element flush */ for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route", tbl, table_size); if (!net->ipv4.route_hdr) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_netns_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_netns_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int netns_ip_rt_init(struct net *net) { /* Set default value for namespaceified sysctls */ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU; net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES; net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS; return 0; } static struct pernet_operations __net_initdata ip_rt_ops = { .init = netns_ip_rt_init, }; static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; static int __net_init ipv4_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv4.peers = bp; return 0; } static void __net_exit ipv4_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv4.peers; net->ipv4.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip_rt_init(void) { void *idents_hash; int cpu; /* For modern hosts, this will use 2 MB of memory */ idents_hash = alloc_large_system_hash("IP idents", sizeof(*ip_idents) + sizeof(*ip_tstamps), 0, 16, /* one bucket per 64 KB */ HASH_ZERO, NULL, &ip_idents_mask, 2048, 256*1024); ip_idents = idents_hash; get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable, SLAB_HWCACHE_ALIGN | SLAB_PANIC); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); ipv4_dst_ops.gc_thresh = ~0; ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&ip_rt_ops); register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); return 0; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif |
| 20 20 20 20 20 8037 8027 8029 6864 8096 8028 8023 8097 8028 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor mediation of files * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/tty.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mount.h> #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/file.h" #include "include/match.h" #include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/label.h" static u32 map_mask_to_chr_mask(u32 mask) { u32 m = mask & PERMS_CHRS_MASK; if (mask & AA_MAY_GETATTR) m |= MAY_READ; if (mask & (AA_MAY_SETATTR | AA_MAY_CHMOD | AA_MAY_CHOWN)) m |= MAY_WRITE; return m; } /** * file_audit_cb - call back for file specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ static void file_audit_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); kuid_t fsuid = ad->subj_cred ? ad->subj_cred->fsuid : current_fsuid(); char str[10]; if (ad->request & AA_AUDIT_FILE_MASK) { aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(ad->request)); audit_log_format(ab, " requested_mask=\"%s\"", str); } if (ad->denied & AA_AUDIT_FILE_MASK) { aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(ad->denied)); audit_log_format(ab, " denied_mask=\"%s\"", str); } if (ad->request & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " fsuid=%d", from_kuid(&init_user_ns, fsuid)); audit_log_format(ab, " ouid=%d", from_kuid(&init_user_ns, ad->fs.ouid)); } if (ad->peer) { audit_log_format(ab, " target="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAG_VIEW_SUBNS, GFP_KERNEL); } else if (ad->fs.target) { audit_log_format(ab, " target="); audit_log_untrustedstring(ab, ad->fs.target); } } /** * aa_audit_file - handle the auditing of file operations * @subj_cred: cred of the subject * @profile: the profile being enforced (NOT NULL) * @perms: the permissions computed for the request (NOT NULL) * @op: operation being mediated * @request: permissions requested * @name: name of object being mediated (MAYBE NULL) * @target: name of target (MAYBE NULL) * @tlabel: target label (MAY BE NULL) * @ouid: object uid * @info: extra information message (MAYBE NULL) * @error: 0 if operation allowed else failure error code * * Returns: %0 or error on failure */ int aa_audit_file(const struct cred *subj_cred, struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error) { int type = AUDIT_APPARMOR_AUTO; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_FILE, op); ad.subj_cred = subj_cred; ad.request = request; ad.name = name; ad.fs.target = target; ad.peer = tlabel; ad.fs.ouid = ouid; ad.info = info; ad.error = error; ad.common.u.tsk = NULL; if (likely(!ad.error)) { u32 mask = perms->audit; if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) mask = 0xffff; /* mask off perms that are not being force audited */ ad.request &= mask; if (likely(!ad.request)) return 0; type = AUDIT_APPARMOR_AUDIT; } else { /* only report permissions that were denied */ ad.request = ad.request & ~perms->allow; AA_BUG(!ad.request); if (ad.request & perms->kill) type = AUDIT_APPARMOR_KILL; /* quiet known rejects, assumes quiet and kill do not overlap */ if ((ad.request & perms->quiet) && AUDIT_MODE(profile) != AUDIT_NOQUIET && AUDIT_MODE(profile) != AUDIT_ALL) ad.request &= ~perms->quiet; if (!ad.request) return ad.error; } ad.denied = ad.request & ~perms->allow; return aa_audit(type, profile, &ad, file_audit_cb); } static int path_name(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, char *buffer, const char **name, struct path_cond *cond, u32 request) { struct aa_profile *profile; const char *info = NULL; int error; error = aa_path_name(path, flags, buffer, name, &info, labels_profile(label)->disconnected); if (error) { fn_for_each_confined(label, profile, aa_audit_file(subj_cred, profile, &nullperms, op, request, *name, NULL, NULL, cond->uid, info, error)); return error; } return 0; } struct aa_perms default_perms = {}; /** * aa_lookup_fperms - convert dfa compressed perms to internal perms * @file_rules: the aa_policydb to lookup perms for (NOT NULL) * @state: state in dfa * @cond: conditions to consider (NOT NULL) * * TODO: convert from dfa + state to permission entry * * Returns: a pointer to a file permission set */ struct aa_perms *aa_lookup_fperms(struct aa_policydb *file_rules, aa_state_t state, struct path_cond *cond) { unsigned int index = ACCEPT_TABLE(file_rules->dfa)[state]; if (!(file_rules->perms)) return &default_perms; if (uid_eq(current_fsuid(), cond->uid)) return &(file_rules->perms[index]); return &(file_rules->perms[index + 1]); } /** * aa_str_perms - find permission that match @name * @file_rules: the aa_policydb to match against (NOT NULL) * @start: state to start matching in * @name: string to match against dfa (NOT NULL) * @cond: conditions to consider for permission set computation (NOT NULL) * @perms: Returns - the permissions found when matching @name * * Returns: the final state in @dfa when beginning @start and walking @name */ aa_state_t aa_str_perms(struct aa_policydb *file_rules, aa_state_t start, const char *name, struct path_cond *cond, struct aa_perms *perms) { aa_state_t state; state = aa_dfa_match(file_rules->dfa, start, name); *perms = *(aa_lookup_fperms(file_rules, state, cond)); return state; } static int __aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); int e = 0; if (profile_unconfined(profile)) return 0; aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], name, cond, perms); if (request & ~perms->allow) e = -EACCES; return aa_audit_file(subj_cred, profile, perms, op, request, name, NULL, NULL, cond->uid, NULL, e); } static int profile_path_perm(const char *op, const struct cred *subj_cred, struct aa_profile *profile, const struct path *path, char *buffer, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { const char *name; int error; if (profile_unconfined(profile)) return 0; error = path_name(op, subj_cred, &profile->label, path, flags | profile->path_flags, buffer, &name, cond, request); if (error) return error; return __aa_path_perm(op, subj_cred, profile, name, request, cond, flags, perms); } /** * aa_path_perm - do permissions check & audit for @path * @op: operation being checked * @subj_cred: subject cred * @label: profile being enforced (NOT NULL) * @path: path to check permissions of (NOT NULL) * @flags: any additional path flags beyond what the profile specifies * @request: requested permissions * @cond: conditional info for this request (NOT NULL) * * Returns: %0 else error if access denied or other error */ int aa_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond) { struct aa_perms perms = {}; struct aa_profile *profile; char *buffer = NULL; int error; flags |= PATH_DELEGATE_DELETED | (S_ISDIR(cond->mode) ? PATH_IS_DIR : 0); buffer = aa_get_buffer(false); if (!buffer) return -ENOMEM; error = fn_for_each_confined(label, profile, profile_path_perm(op, subj_cred, profile, path, buffer, request, cond, flags, &perms)); aa_put_buffer(buffer); return error; } /** * xindex_is_subset - helper for aa_path_link * @link: link permission set * @target: target permission set * * test target x permissions are equal OR a subset of link x permissions * this is done as part of the subset test, where a hardlink must have * a subset of permissions that the target has. * * Returns: true if subset else false */ static inline bool xindex_is_subset(u32 link, u32 target) { if (((link & ~AA_X_UNSAFE) != (target & ~AA_X_UNSAFE)) || ((link & AA_X_UNSAFE) && !(target & AA_X_UNSAFE))) return false; return true; } static int profile_path_link(const struct cred *subj_cred, struct aa_profile *profile, const struct path *link, char *buffer, const struct path *target, char *buffer2, struct path_cond *cond) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; u32 request = AA_MAY_LINK; aa_state_t state; int error; error = path_name(OP_LINK, subj_cred, &profile->label, link, profile->path_flags, buffer, &lname, cond, AA_MAY_LINK); if (error) goto audit; /* buffer2 freed below, tname is pointer in buffer2 */ error = path_name(OP_LINK, subj_cred, &profile->label, target, profile->path_flags, buffer2, &tname, cond, AA_MAY_LINK); if (error) goto audit; error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ state = aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) goto audit; /* test to see if target can be paired with link */ state = aa_dfa_null_transition(rules->file->dfa, state); aa_str_perms(rules->file, state, tname, cond, &perms); /* force audit/quiet masks for link are stored in the second entry * in the link pair. */ lperms.audit = perms.audit; lperms.quiet = perms.quiet; lperms.kill = perms.kill; if (!(perms.allow & AA_MAY_LINK)) { info = "target restricted"; lperms = perms; goto audit; } /* done if link subset test is not required */ if (!(perms.allow & AA_LINK_SUBSET)) goto done_tests; /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. */ aa_str_perms(rules->file, rules->file->start[AA_CLASS_FILE], tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ request = lperms.allow & ~AA_MAY_LINK; lperms.allow &= perms.allow | AA_MAY_LINK; request |= AA_AUDIT_FILE_MASK & (lperms.allow & ~perms.allow); if (request & ~lperms.allow) { goto audit; } else if ((lperms.allow & MAY_EXEC) && !xindex_is_subset(lperms.xindex, perms.xindex)) { lperms.allow &= ~MAY_EXEC; request |= MAY_EXEC; info = "link not subset of target"; goto audit; } done_tests: error = 0; audit: return aa_audit_file(subj_cred, profile, &lperms, OP_LINK, request, lname, tname, NULL, cond->uid, info, error); } /** * aa_path_link - Handle hard link permission check * @subj_cred: subject cred * @label: the label being enforced (NOT NULL) * @old_dentry: the target dentry (NOT NULL) * @new_dir: directory the new link will be created in (NOT NULL) * @new_dentry: the link being created (NOT NULL) * * Handle the permission test for a link & target pair. Permission * is encoded as a pair where the link permission is determined * first, and if allowed, the target is tested. The target test * is done from the point of the link match (not start of DFA) * making the target permission dependent on the link permission match. * * The subset test if required forces that permissions granted * on link are a subset of the permission granted to target. * * Returns: %0 if allowed else error */ int aa_path_link(const struct cred *subj_cred, struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path link = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path target = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path_cond cond = { d_backing_inode(old_dentry)->i_uid, d_backing_inode(old_dentry)->i_mode }; char *buffer = NULL, *buffer2 = NULL; struct aa_profile *profile; int error; /* buffer freed below, lname is pointer in buffer */ buffer = aa_get_buffer(false); buffer2 = aa_get_buffer(false); error = -ENOMEM; if (!buffer || !buffer2) goto out; error = fn_for_each_confined(label, profile, profile_path_link(subj_cred, profile, &link, buffer, &target, buffer2, &cond)); out: aa_put_buffer(buffer); aa_put_buffer(buffer2); return error; } static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label, u32 request) { struct aa_label *l, *old; /* update caching of label on file_ctx */ spin_lock(&fctx->lock); old = rcu_dereference_protected(fctx->label, lockdep_is_held(&fctx->lock)); l = aa_label_merge(old, label, GFP_ATOMIC); if (l) { if (l != old) { rcu_assign_pointer(fctx->label, l); aa_put_label(old); } else aa_put_label(l); fctx->allow |= request; } spin_unlock(&fctx->lock); } static int __file_path_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied, bool in_atomic) { struct aa_profile *profile; struct aa_perms perms = {}; vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(file), file_inode(file)); struct path_cond cond = { .uid = vfsuid_into_kuid(vfsuid), .mode = file_inode(file)->i_mode }; char *buffer; int flags, error; /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) /* TODO: check for revocation on stale profiles */ return 0; flags = PATH_DELEGATE_DELETED | (S_ISDIR(cond.mode) ? PATH_IS_DIR : 0); buffer = aa_get_buffer(in_atomic); if (!buffer) return -ENOMEM; /* check every profile in task label not in current cache */ error = fn_for_each_not_in_set(flabel, label, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); if (denied && !error) { /* * check every profile in file label that was not tested * in the initial check above. * * TODO: cache full perms so this only happens because of * conditionals * TODO: don't audit here */ if (label == flabel) error = fn_for_each(label, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); else error = fn_for_each_not_in_set(label, flabel, profile, profile_path_perm(op, subj_cred, profile, &file->f_path, buffer, request, &cond, flags, &perms)); } if (!error) update_file_ctx(file_ctx(file), label, request); aa_put_buffer(buffer); return error; } static int __file_sock_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied) { struct socket *sock = (struct socket *) file->private_data; int error; AA_BUG(!sock); /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) return 0; /* TODO: improve to skip profiles cached in flabel */ error = aa_sock_file_perm(subj_cred, label, op, request, sock); if (denied) { /* TODO: improve to skip profiles checked above */ /* check every profile in file label to is cached */ last_error(error, aa_sock_file_perm(subj_cred, flabel, op, request, sock)); } if (!error) update_file_ctx(file_ctx(file), label, request); return error; } /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked * @subj_cred: subject cred * @label: label being enforced (NOT NULL) * @file: file to revalidate access permissions on (NOT NULL) * @request: requested permissions * @in_atomic: whether allocations need to be done in atomic context * * Returns: %0 if access allowed else error */ int aa_file_perm(const char *op, const struct cred *subj_cred, struct aa_label *label, struct file *file, u32 request, bool in_atomic) { struct aa_file_ctx *fctx; struct aa_label *flabel; u32 denied; int error = 0; AA_BUG(!label); AA_BUG(!file); fctx = file_ctx(file); rcu_read_lock(); flabel = rcu_dereference(fctx->label); AA_BUG(!flabel); /* revalidate access, if task is unconfined, or the cached cred * doesn't match or if the request is for more permissions than * was granted. * * Note: the test for !unconfined(flabel) is to handle file * delegation from unconfined tasks */ denied = request & ~fctx->allow; if (unconfined(label) || unconfined(flabel) || (!denied && aa_label_is_subset(flabel, label))) { rcu_read_unlock(); goto done; } flabel = aa_get_newest_label(flabel); rcu_read_unlock(); /* TODO: label cross check */ if (file->f_path.mnt && path_mediated_fs(file->f_path.dentry)) error = __file_path_perm(op, subj_cred, label, flabel, file, request, denied, in_atomic); else if (S_ISSOCK(file_inode(file)->i_mode)) error = __file_sock_perm(op, subj_cred, label, flabel, file, request, denied); aa_put_label(flabel); done: return error; } static void revalidate_tty(const struct cred *subj_cred, struct aa_label *label) { struct tty_struct *tty; int drop_tty = 0; tty = get_current_tty(); if (!tty) return; spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; struct file *file; /* TODO: Revalidate access to controlling tty. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (aa_file_perm(OP_INHERIT, subj_cred, label, file, MAY_READ | MAY_WRITE, IN_ATOMIC)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); if (drop_tty) no_tty(); } struct cred_label { const struct cred *cred; struct aa_label *label; }; static int match_file(const void *p, struct file *file, unsigned int fd) { struct cred_label *cl = (struct cred_label *)p; if (aa_file_perm(OP_INHERIT, cl->cred, cl->label, file, aa_map_file_to_perms(file), IN_ATOMIC)) return fd + 1; return 0; } /* based on selinux's flush_unauthorized_files */ void aa_inherit_files(const struct cred *cred, struct files_struct *files) { struct aa_label *label = aa_get_newest_cred_label(cred); struct cred_label cl = { .cred = cred, .label = label, }; struct file *devnull = NULL; unsigned int n; revalidate_tty(cred, label); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, &cl); if (!n) /* none found? */ goto out; devnull = dentry_open(&aa_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, &cl)) != 0); if (devnull) fput(devnull); out: aa_put_label(label); } |
| 80 4 4 4 4 3 3 36 36 36 36 36 36 40 40 39 40 40 |